/*
 * dpsoftrast.c -- from xonotic/darkplaces.git (git.xonotic.org, blob view)
 * Commit note: removed unused function DPSOFTRAST_Draw_Span_Finish
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// Alignment sizes in bytes; they mirror the literal 16/32 used by the
// ALIGN()/ATOMIC() decorations below.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Platform-specific alignment decorations, store fence and atomic counter
// primitives. Only provided when SSE_POSSIBLE is defined; anything left
// undefined here is supplied by the plain fallbacks that follow this block.
#if defined(SSE_POSSIBLE)
#  if defined(__APPLE__)
#    include <libkern/OSAtomic.h>
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
#    define MEMORY_BARRIER (_mm_sfence())
#    define ATOMIC_COUNTER volatile int32_t
#    define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
#    define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#  elif defined(__GNUC__)
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
#    define MEMORY_BARRIER (_mm_sfence())
     //(__sync_synchronize())
#    define ATOMIC_COUNTER volatile int
#    define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#    define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
#    define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#  elif defined(_MSC_VER)
#    define ALIGN(var) __declspec(align(16)) var
#    define ATOMIC(var) __declspec(align(32)) var
#    define MEMORY_BARRIER (_mm_sfence())
     //(MemoryBarrier())
#    define ATOMIC_COUNTER volatile LONG
#    define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#    define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#  endif
#endif
50
// Fallbacks for every decoration/primitive that was not defined earlier:
// no alignment request, no fence, and plain (non-atomic) integer arithmetic.
#if !defined(ALIGN)
#  define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#  define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#  define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#  define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#  define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#  define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#  define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Allocation helpers. With SSE_POSSIBLE the memory comes from _mm_malloc with
// ATOMIC_SIZE alignment and must be released with MM_FREE (_mm_free);
// otherwise the standard malloc/calloc/free are used directly.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zero-filled bytes
// aligned to ATOMIC_SIZE, or NULL if the allocation fails or the byte count
// cannot be represented in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse products that would wrap around, so an oversized request
        // fails instead of silently under-allocating
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Index of each per-vertex attribute array. DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size the attribute tables in this file
// (DPSOFTRAST_State_Triangle.attribs, DPSOFTRAST_State.post_array4f).
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION = 0,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL // count of the entries above, not an array itself
} DPSOFTRAST_ARRAY;
107
108 typedef struct DPSOFTRAST_Texture_s
109 {
110         int flags;
111         int width;
112         int height;
113         int depth;
114         int sides;
115         DPSOFTRAST_TEXTURE_FILTER filter;
116         int mipmaps;
117         int size;
118         ATOMIC_COUNTER binds;
119         unsigned char *bytes;
120         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
121 }
122 DPSOFTRAST_Texture;
123
// Commands use the same alignment as ALIGN()-decorated data.
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header shared by every command structure; DEFCOMMAND-generated structures
// start with these same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;       // one of the DPSOFTRAST_OPCODE_* values
        unsigned short commandsize; // NOTE(review): presumably the size of the whole command in the pool - confirm
} DPSOFTRAST_Command);
133
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval, and
//   - the aligned structure DPSOFTRAST_Command_<name>, which begins with the
//     common opcode/commandsize header followed by the caller-supplied fields.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );

// Size in bytes of the command pool buffer (2 MiB) ...
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL (2 * 1024 * 1024)
// ... and a 16 KiB per-command limit (NOTE(review): inferred from the name - confirm).
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE (16 * 1024)
147
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
149 {
150         int freecommand;
151         int usedcommands;
152         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
153 }
154 DPSOFTRAST_State_Command_Pool);
155
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
157 {
158         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
159         float w[3];
160         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
161 }
162 DPSOFTRAST_State_Triangle);
163
// Evaluate one interpolated attribute for a span.
//   triangle   - pointer to an object with an attribs[][3][4] table
//   span       - pointer to an object with x and y members
//   data       - receives attribs[arrayindex][2] + x*attribs[arrayindex][0] + y*attribs[arrayindex][1]
//   slope      - receives attribs[arrayindex][0] (the per-x increment)
//   arrayindex - which attribute array to evaluate
// DPSOFTRAST_CALCATTRIB is the SSE form (data/slope are __m128 lvalues),
// DPSOFTRAST_CALCATTRIB4F the scalar form (data/slope are float[4]).
// Both expand to a brace block rather than an expression, and evaluate
// their arguments more than once, so arguments must be free of side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced from a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle;             // triangle this span was generated by
        int x;                    // framebuffer x coord
        int y;                    // framebuffer y coord
        int startx;               // usable range (according to pixelmask)
        int endx;                 // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase;            // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope;           // depthbuffer value pixel delta
} DPSOFTRAST_State_Span);
195
// Capacities of the per-thread span/triangle arrays and of the pixel mask
// buffer (see DPSOFTRAST_State_Thread).
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bits of DPSOFTRAST_State_Thread.validate: each one marks a group of derived
// values that DPSOFTRAST_Validate recomputes on demand.
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
204
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// the current blend factor pair onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE = 0,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_INVADD,
        DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
} DPSOFTRAST_BLENDMODE;
220
221 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
222 {
223         void *thread;
224         int index;
225         
226         int cullface;
227         int colormask[4];
228         int blendfunc[2];
229         int blendsubtract;
230         int depthmask;
231         int depthtest;
232         int depthfunc;
233         int scissortest;
234         int alphatest;
235         int alphafunc;
236         float alphavalue;
237         int viewport[4];
238         int scissor[4];
239         float depthrange[2];
240         float polygonoffset[2];
241         float clipplane[4];
242         ALIGN(float fb_clipplane[4]);
243
244         int shader_mode;
245         int shader_permutation;
246         int shader_exactspecularmath;
247
248         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
249         
250         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
251         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
252
253         // DPSOFTRAST_VALIDATE_ flags
254         int validate;
255
256         // derived values (DPSOFTRAST_VALIDATE_FB)
257         int fb_colormask;
258         int fb_scissor[4];
259         ALIGN(float fb_viewportcenter[4]);
260         ALIGN(float fb_viewportscale[4]);
261
262         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
263         int fb_depthfunc;
264
265         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
266         int fb_blendmode;
267
268         // band boundaries
269         int miny1;
270         int maxy1;
271         int miny2;
272         int maxy2;
273
274         ATOMIC(volatile int commandoffset);
275
276         volatile bool waiting;
277         volatile bool starving;
278         void *waitcond;
279         void *drawcond;
280         void *drawmutex;
281
282         int numspans;
283         int numtriangles;
284         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
285         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
286         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
287 }
288 DPSOFTRAST_State_Thread);
289
290 typedef ATOMIC(struct DPSOFTRAST_State_s
291 {
292         int fb_width;
293         int fb_height;
294         unsigned int *fb_depthpixels;
295         unsigned int *fb_colorpixels[4];
296
297         int viewport[4];
298         ALIGN(float fb_viewportcenter[4]);
299         ALIGN(float fb_viewportscale[4]);
300
301         float color[4];
302         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
303         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
304
305         const float *pointer_vertex3f;
306         const float *pointer_color4f;
307         const unsigned char *pointer_color4ub;
308         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
309         int stride_vertex;
310         int stride_color;
311         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
312         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
313         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
314
315         int firstvertex;
316         int numvertices;
317         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
318         float *screencoord4f;
319         int drawstarty;
320         int drawendy;
321         int drawclipped;
322         
323         int shader_mode;
324         int shader_permutation;
325         int shader_exactspecularmath;
326
327         int texture_max;
328         int texture_end;
329         int texture_firstfree;
330         DPSOFTRAST_Texture *texture;
331
332         int bigendian;
333
334         // error reporting
335         const char *errorstring;
336
337         bool usethreads;
338         int interlace;
339         int numthreads;
340         DPSOFTRAST_State_Thread *threads;
341
342         ATOMIC(volatile int drawcommand);
343
344         DPSOFTRAST_State_Command_Pool commandpool;
345 }
346 DPSOFTRAST_State);
347
348 DPSOFTRAST_State dpsoftrast;
349
// Scale (2^30) and offset used when converting depth values to integers.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (nominally 0..1) into one 32bit value laid out as
// blue in bits 0-7, green in bits 8-15, red in bits 16-23, alpha in bits 24-31.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation.
// Arguments are fully parenthesized so expressions may be passed, and the
// shifts are done on unsigned values because alpha 255 shifted by 24 does not
// fit in a signed int; the result is converted back to int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) \
        ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
               ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
                (unsigned int)(int)((b) * 255.0f + 0.5f) | \
               ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth buffer format: d == 1 maps to
// 0 and smaller d to larger values, scaled by DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
354
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
357
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
359 {
360         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362         fb_viewportcenter[3] = 0.5f;
363         fb_viewportcenter[0] = 0.0f;
364         fb_viewportscale[1] = 0.5f * viewport[2];
365         fb_viewportscale[2] = -0.5f * viewport[3];
366         fb_viewportscale[3] = 0.5f;
367         fb_viewportscale[0] = 1.0f;
368 }
369
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
371 {
372         if (dpsoftrast.interlace)
373         {
374                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
378         }
379         else
380         {
381                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
383         }
384 }
385
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
387 {
388         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
393 }
394
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
396 {
397         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398         // and viewport projection values
399         int x1, x2;
400         int y1, y2;
401         x1 = thread->scissor[0];
402         x2 = thread->scissor[0] + thread->scissor[2];
403         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404         y2 = dpsoftrast.fb_height - thread->scissor[1];
405         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
406         if (x1 < 0) x1 = 0;
407         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
408         if (y1 < 0) y1 = 0;
409         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410         thread->fb_scissor[0] = x1;
411         thread->fb_scissor[1] = y1;
412         thread->fb_scissor[2] = x2 - x1;
413         thread->fb_scissor[3] = y2 - y1;
414
415         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416         DPSOFTRAST_RecalcClipPlane(thread);
417         DPSOFTRAST_RecalcThread(thread);
418 }
419
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
421 {
422         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
423 }
424
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
426 {
427         if (thread->blendsubtract)
428         {
429                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
430                 {
431                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437         else
438         {       
439                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
440                 {
441                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
446                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
452                 }
453         }
454 }
455
// Inline pre-check for DPSOFTRAST_Validate: only makes the call when at least
// one of the flags in f is still pending on the thread. Always evaluates to 0.
// Both arguments are evaluated twice, so they must be free of side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
457
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
459 {
460         mask &= thread->validate;
461         if (!mask)
462                 return;
463         if (mask & DPSOFTRAST_VALIDATE_FB)
464         {
465                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466                 DPSOFTRAST_RecalcFB(thread);
467         }
468         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
469         {
470                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471                 DPSOFTRAST_RecalcDepthFunc(thread);
472         }
473         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
474         {
475                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476                 DPSOFTRAST_RecalcBlendFunc(thread);
477         }
478 }
479
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
481 {
482         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483                 return &dpsoftrast.texture[index];
484         return NULL;
485 }
486
487 static void DPSOFTRAST_Texture_Grow(void)
488 {
489         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490         DPSOFTRAST_State_Thread *thread;
491         int i;
492         int j;
493         DPSOFTRAST_Flush();
494         // expand texture array as needed
495         if (dpsoftrast.texture_max < 1024)
496                 dpsoftrast.texture_max = 1024;
497         else
498                 dpsoftrast.texture_max *= 2;
499         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
500         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501                 if (dpsoftrast.texbound[i])
502                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
503         for (j = 0; j < dpsoftrast.numthreads; j++)
504         {
505                 thread = &dpsoftrast.threads[j];
506                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507                         if (thread->texbound[i])
508                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
509         }
510 }
511
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
513 {
514         int w;
515         int h;
516         int d;
517         int size;
518         int s;
519         int texnum;
520         int mipmaps;
521         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523         DPSOFTRAST_Texture *texture;
524         if (width*height*depth < 1)
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
527                 return 0;
528         }
529         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
532                 return 0;
533         }
534         switch(texformat)
535         {
536         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
539                 break;
540         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
542                 {
543                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
544                         return 0;
545                 }
546                 if (depth != 1)
547                 {
548                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
549                         return 0;
550                 }
551                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
552                 {
553                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
554                         return 0;
555                 }
556                 break;
557         }
558         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
559         {
560                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
561                 return 0;
562         }
563         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564         {
565                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
566                 return 0;
567         }
568         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569         {
570                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
571                 return 0;
572         }
573         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
576                 return 0;
577         }
578         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
581                 return 0;
582         }
583         // find first empty slot in texture array
584         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585                 if (!dpsoftrast.texture[texnum].bytes)
586                         break;
587         dpsoftrast.texture_firstfree = texnum + 1;
588         if (dpsoftrast.texture_max <= texnum)
589                 DPSOFTRAST_Texture_Grow();
590         if (dpsoftrast.texture_end <= texnum)
591                 dpsoftrast.texture_end = texnum + 1;
592         texture = &dpsoftrast.texture[texnum];
593         memset(texture, 0, sizeof(*texture));
594         texture->flags = flags;
595         texture->width = width;
596         texture->height = height;
597         texture->depth = depth;
598         texture->sides = sides;
599         texture->binds = 0;
600         w = width;
601         h = height;
602         d = depth;
603         size = 0;
604         mipmaps = 0;
605         w = width;
606         h = height;
607         d = depth;
608         for (;;)
609         {
610                 s = w * h * d * sides * 4;
611                 texture->mipmap[mipmaps][0] = size;
612                 texture->mipmap[mipmaps][1] = s;
613                 texture->mipmap[mipmaps][2] = w;
614                 texture->mipmap[mipmaps][3] = h;
615                 texture->mipmap[mipmaps][4] = d;
616                 size += s;
617                 mipmaps++;
618                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
619                         break;
620                 if (w > 1) w >>= 1;
621                 if (h > 1) h >>= 1;
622                 if (d > 1) d >>= 1;
623         }
624         texture->mipmaps = mipmaps;
625         texture->size = size;
626
627         // allocate the pixels now
628         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
629
630         return texnum;
631 }
632 void DPSOFTRAST_Texture_Free(int index)
633 {
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->binds)
637                 DPSOFTRAST_Flush();
638         if (texture->bytes)
639                 MM_FREE(texture->bytes);
640         texture->bytes = NULL;
641         memset(texture, 0, sizeof(*texture));
642         // adjust the free range and used range
643         if (dpsoftrast.texture_firstfree > index)
644                 dpsoftrast.texture_firstfree = index;
645         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646                 dpsoftrast.texture_end--;
647 }
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
649 {
650         int i, x, y, z, w, layer0, layer1, row0, row1;
651         unsigned char *o, *i0, *i1, *i2, *i3;
652         DPSOFTRAST_Texture *texture;
653         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654         if (texture->mipmaps <= 1)
655                 return;
656         for (i = 1;i < texture->mipmaps;i++)
657         {
658                 for (z = 0;z < texture->mipmap[i][4];z++)
659                 {
660                         layer0 = z*2;
661                         layer1 = z*2+1;
662                         if (layer1 >= texture->mipmap[i-1][4])
663                                 layer1 = texture->mipmap[i-1][4]-1;
664                         for (y = 0;y < texture->mipmap[i][3];y++)
665                         {
666                                 row0 = y*2;
667                                 row1 = y*2+1;
668                                 if (row1 >= texture->mipmap[i-1][3])
669                                         row1 = texture->mipmap[i-1][3]-1;
670                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
671                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675                                 w = texture->mipmap[i][2];
676                                 if (layer1 > layer0)
677                                 {
678                                         if (texture->mipmap[i-1][2] > 1)
679                                         {
680                                                 // average 3D texture
681                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
682                                                 {
683                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
687                                                 }
688                                         }
689                                         else
690                                         {
691                                                 // average 3D mipmap with parent width == 1
692                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
693                                                 {
694                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
698                                                 }
699                                         }
700                                 }
701                                 else
702                                 {
703                                         if (texture->mipmap[i-1][2] > 1)
704                                         {
705                                                 // average 2D texture (common case)
706                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
707                                                 {
708                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
712                                                 }
713                                         }
714                                         else
715                                         {
716                                                 // 2D texture with parent width == 1
717                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
718                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
719                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
720                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
721                                         }
722                                 }
723                         }
724                 }
725         }
726 }
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
728 {
729         DPSOFTRAST_Texture *texture;
730         unsigned char *dst;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (texture->binds)
733                 DPSOFTRAST_Flush();
734         if (pixels)
735         {
736                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737                 while (blockheight > 0)
738                 {
739                         memcpy(dst, pixels, blockwidth * 4);
740                         pixels += blockwidth * 4;
741                         dst += texture->mipmap[0][2] * 4;
742                         blockheight--;
743                 }
744         }
745         DPSOFTRAST_Texture_CalculateMipmaps(index);
746 }
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
751         if (texture->binds)
752                 DPSOFTRAST_Flush();
753         if (pixels)
754                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755         DPSOFTRAST_Texture_CalculateMipmaps(index);
756 }
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         return texture->mipmap[mip][2];
762 }
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
764 {
765         DPSOFTRAST_Texture *texture;
766         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767         return texture->mipmap[mip][3];
768 }
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
770 {
771         DPSOFTRAST_Texture *texture;
772         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773         return texture->mipmap[mip][4];
774 }
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
776 {
777         DPSOFTRAST_Texture *texture;
778         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779         if (texture->binds)
780                 DPSOFTRAST_Flush();
781         return texture->bytes + texture->mipmap[mip][0];
782 }
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
784 {
785         DPSOFTRAST_Texture *texture;
786         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
788         {
789                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
790                 return;
791         }
792         if (texture->binds)
793                 DPSOFTRAST_Flush();
794         texture->filter = filter;
795 }
796
797 static void DPSOFTRAST_Draw_FlushThreads(void);
798
799 static void DPSOFTRAST_Draw_SyncCommands(void)
800 {
801         if(dpsoftrast.usethreads) MEMORY_BARRIER;
802         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
803 }
804
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
806 {
807         DPSOFTRAST_State_Thread *thread;
808         int i;
809         int freecommand = dpsoftrast.commandpool.freecommand;
810         int usedcommands = dpsoftrast.commandpool.usedcommands;
811         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
812                 return;
813         DPSOFTRAST_Draw_SyncCommands();
814         for(;;)
815         {
816                 int waitindex = -1;
817                 int commandoffset;
818                 usedcommands = 0;
819                 for (i = 0; i < dpsoftrast.numthreads; i++)
820                 {
821                         thread = &dpsoftrast.threads[i]; 
822                         commandoffset = freecommand - thread->commandoffset;
823                         if (commandoffset < 0)
824                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
825                         if (commandoffset > usedcommands)
826                         {
827                                 waitindex = i;
828                                 usedcommands = commandoffset;
829                         }
830                 }
831                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
832                         break;
833                 thread = &dpsoftrast.threads[waitindex];
834                 Thread_LockMutex(thread->drawmutex);
835                 if (thread->commandoffset != dpsoftrast.drawcommand)
836                 {
837                         thread->waiting = true;
838                         if (thread->starving) Thread_CondSignal(thread->drawcond);
839                         Thread_CondWait(thread->waitcond, thread->drawmutex);
840                         thread->waiting = false;
841                 }
842                 Thread_UnlockMutex(thread->drawmutex);
843         }
844         dpsoftrast.commandpool.usedcommands = usedcommands;
845 }
846
// rounds 'size' up to the next multiple of COMMAND_SIZE (which must be a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// allocates a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name> from the command pool
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
851
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
853 {
854         DPSOFTRAST_Command *command;
855         int freecommand = dpsoftrast.commandpool.freecommand;
856         int usedcommands = dpsoftrast.commandpool.usedcommands;
857         int extra = sizeof(DPSOFTRAST_Command);
858         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
860         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
861         {
862                 if (dpsoftrast.usethreads)
863                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
864                 else
865                         DPSOFTRAST_Draw_FlushThreads();
866                 freecommand = dpsoftrast.commandpool.freecommand;
867                 usedcommands = dpsoftrast.commandpool.usedcommands;
868         }
869         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
870         {
871                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872                 command->opcode = DPSOFTRAST_OPCODE_Reset;
873                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
874                 freecommand = 0;
875         }
876         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877         command->opcode = opcode;
878         command->commandsize = size;
879         freecommand += size;
880         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
881                 freecommand = 0;
882         dpsoftrast.commandpool.freecommand = freecommand;
883         dpsoftrast.commandpool.usedcommands = usedcommands + size;
884         return command;
885 }
886
887 static void DPSOFTRAST_UndoCommand(int size)
888 {
889         int freecommand = dpsoftrast.commandpool.freecommand;
890         int usedcommands = dpsoftrast.commandpool.usedcommands;
891         freecommand -= size;
892         if (freecommand < 0)
893                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894         usedcommands -= size;
895         dpsoftrast.commandpool.freecommand = freecommand;
896         dpsoftrast.commandpool.usedcommands = usedcommands;
897 }
898                 
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
901 {
902         thread->viewport[0] = command->x;
903         thread->viewport[1] = command->y;
904         thread->viewport[2] = command->width;
905         thread->viewport[3] = command->height;
906         thread->validate |= DPSOFTRAST_VALIDATE_FB;
907 }
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
909 {
910         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
911         command->x = x;
912         command->y = y;
913         command->width = width;
914         command->height = height;
915
916         dpsoftrast.viewport[0] = x;
917         dpsoftrast.viewport[1] = y;
918         dpsoftrast.viewport[2] = width;
919         dpsoftrast.viewport[3] = height;
920         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
921 }
922
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
925 {
926         int i, x1, y1, x2, y2, w, h, x, y;
927         int miny1, maxy1, miny2, maxy2;
928         int bandy;
929         unsigned int *p;
930         unsigned int c;
931         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932         miny1 = thread->miny1;
933         maxy1 = thread->maxy1;
934         miny2 = thread->miny2;
935         maxy2 = thread->maxy2;
936         x1 = thread->fb_scissor[0];
937         y1 = thread->fb_scissor[1];
938         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
940         if (y1 < miny1) y1 = miny1;
941         if (y2 > maxy2) y2 = maxy2;
942         w = x2 - x1;
943         h = y2 - y1;
944         if (w < 1 || h < 1)
945                 return;
946         // FIXME: honor fb_colormask?
947         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948         for (i = 0;i < 4;i++)
949         {
950                 if (!dpsoftrast.fb_colorpixels[i])
951                         continue;
952                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
953                 for (;y < bandy;y++)
954                 {
955                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956                         for (x = x1;x < x2;x++)
957                                 p[x] = c;
958                 }
959         }
960 }
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
962 {
963         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
964         command->r = r;
965         command->g = g;
966         command->b = b;
967         command->a = a;
968 }
969
970 DEFCOMMAND(3, ClearDepth, float depth;)
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
972 {
973         int x1, y1, x2, y2, w, h, x, y;
974         int miny1, maxy1, miny2, maxy2;
975         int bandy;
976         unsigned int *p;
977         unsigned int c;
978         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979         miny1 = thread->miny1;
980         maxy1 = thread->maxy1;
981         miny2 = thread->miny2;
982         maxy2 = thread->maxy2;
983         x1 = thread->fb_scissor[0];
984         y1 = thread->fb_scissor[1];
985         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
987         if (y1 < miny1) y1 = miny1;
988         if (y2 > maxy2) y2 = maxy2;
989         w = x2 - x1;
990         h = y2 - y1;
991         if (w < 1 || h < 1)
992                 return;
993         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
994         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
995         for (;y < bandy;y++)
996         {
997                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998                 for (x = x1;x < x2;x++)
999                         p[x] = c;
1000         }
1001 }
1002 void DPSOFTRAST_ClearDepth(float d)
1003 {
1004         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1005         command->depth = d;
1006 }
1007
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1010 {
1011         thread->colormask[0] = command->r != 0;
1012         thread->colormask[1] = command->g != 0;
1013         thread->colormask[2] = command->b != 0;
1014         thread->colormask[3] = command->a != 0;
1015         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1016 }
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1018 {
1019         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1020         command->r = r;
1021         command->g = g;
1022         command->b = b;
1023         command->a = a;
1024 }
1025
1026 DEFCOMMAND(5, DepthTest, int enable;)
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1028 {
1029         thread->depthtest = command->enable;
1030         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1031 }
1032 void DPSOFTRAST_DepthTest(int enable)
1033 {
1034         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035         command->enable = enable;
1036 }
1037
1038 DEFCOMMAND(6, ScissorTest, int enable;)
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1040 {
1041         thread->scissortest = command->enable;
1042         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1043 }
1044 void DPSOFTRAST_ScissorTest(int enable)
1045 {
1046         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047         command->enable = enable;
1048 }
1049
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1052 {
1053         thread->scissor[0] = command->x;
1054         thread->scissor[1] = command->y;
1055         thread->scissor[2] = command->width;
1056         thread->scissor[3] = command->height;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1060 {
1061         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1062         command->x = x;
1063         command->y = y;
1064         command->width = width;
1065         command->height = height;
1066 }
1067
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1070 {
1071         thread->blendfunc[0] = command->sfactor;
1072         thread->blendfunc[1] = command->dfactor;
1073         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1074 }
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1076 {
1077         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078         command->sfactor = sfactor;
1079         command->dfactor = dfactor;
1080 }
1081
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1084 {
1085         thread->blendsubtract = command->enable;
1086         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1087 }
1088 void DPSOFTRAST_BlendSubtract(int enable)
1089 {
1090         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091         command->enable = enable;
1092 }
1093
1094 DEFCOMMAND(10, DepthMask, int enable;)
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1096 {
1097         thread->depthmask = command->enable;
1098 }
1099 void DPSOFTRAST_DepthMask(int enable)
1100 {
1101         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102         command->enable = enable;
1103 }
1104
1105 DEFCOMMAND(11, DepthFunc, int func;)
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1107 {
1108         thread->depthfunc = command->func;
1109 }
1110 void DPSOFTRAST_DepthFunc(int func)
1111 {
1112         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113         command->func = func;
1114 }
1115
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1118 {
1119         thread->depthrange[0] = command->nearval;
1120         thread->depthrange[1] = command->farval;
1121 }
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1123 {
1124         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125         command->nearval = nearval;
1126         command->farval = farval;
1127 }
1128
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1131 {
1132         thread->polygonoffset[0] = command->alongnormal;
1133         thread->polygonoffset[1] = command->intoview;
1134 }
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1136 {
1137         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138         command->alongnormal = alongnormal;
1139         command->intoview = intoview;
1140 }
1141
1142 DEFCOMMAND(14, CullFace, int mode;)
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1144 {
1145         thread->cullface = command->mode;
1146 }
1147 void DPSOFTRAST_CullFace(int mode)
1148 {
1149         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150         command->mode = mode;
1151 }
1152
1153 DEFCOMMAND(15, AlphaTest, int enable;)
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1155 {
1156         thread->alphatest = command->enable;
1157 }
1158 void DPSOFTRAST_AlphaTest(int enable)
1159 {
1160         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161         command->enable = enable;
1162 }
1163
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1166 {
1167         thread->alphafunc = command->func;
1168         thread->alphavalue = command->ref;
1169 }
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1171 {
1172         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173         command->func = func;
1174         command->ref = ref;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - 1 - sy1;
1282         for (y = 0;y < th;y++)
1283                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1284         if (texture->mipmaps > 1)
1285                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1286 }
1287
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1290 {
1291         if (thread->texbound[command->unitnum])
1292                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293         thread->texbound[command->unitnum] = command->texture;
1294 }
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1296 {
1297         DPSOFTRAST_Command_SetTexture *command;
1298         DPSOFTRAST_Texture *texture;
1299         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1300         {
1301                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302                 return;
1303         }
1304         texture = DPSOFTRAST_Texture_GetByIndex(index);
1305         if (index && !texture)
1306         {
1307                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1308                 return;
1309         }
1310
1311         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312         command->unitnum = unitnum;
1313         command->texture = texture;
1314
1315         dpsoftrast.texbound[unitnum] = texture;
1316         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1317 }
1318
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1320 {
1321         dpsoftrast.pointer_vertex3f = vertex3f;
1322         dpsoftrast.stride_vertex = stride;
1323 }
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1325 {
1326         dpsoftrast.pointer_color4f = color4f;
1327         dpsoftrast.pointer_color4ub = NULL;
1328         dpsoftrast.stride_color = stride;
1329 }
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1331 {
1332         dpsoftrast.pointer_color4f = NULL;
1333         dpsoftrast.pointer_color4ub = color4ub;
1334         dpsoftrast.stride_color = stride;
1335 }
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1337 {
1338         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340         dpsoftrast.stride_texcoord[unitnum] = stride;
1341 }
1342
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1345 {
1346         thread->shader_mode = command->mode;
1347         thread->shader_permutation = command->permutation;
1348         thread->shader_exactspecularmath = command->exactspecularmath;
1349 }
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 {
1352         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353         command->mode = mode;
1354         command->permutation = permutation;
1355         command->exactspecularmath = exactspecularmath;
1356
1357         dpsoftrast.shader_mode = mode;
1358         dpsoftrast.shader_permutation = permutation;
1359         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1360 }
1361
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1364 {
1365         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1366 }
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 {
1369         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370         command->index = index;
1371         command->val[0] = v0;
1372         command->val[1] = v1;
1373         command->val[2] = v2;
1374         command->val[3] = v3;
1375
1376         dpsoftrast.uniform4f[index*4+0] = v0;
1377         dpsoftrast.uniform4f[index*4+1] = v1;
1378         dpsoftrast.uniform4f[index*4+2] = v2;
1379         dpsoftrast.uniform4f[index*4+3] = v3;
1380 }
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 {
1383         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384         command->index = index;
1385         memcpy(command->val, v, sizeof(command->val));
1386
1387         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1388 }
1389
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1392 {
1393         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1394 }
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1396 {
1397 #ifdef SSE_POSSIBLE
1398         int i, index;
1399         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1400         {
1401                 __m128 m0, m1, m2, m3;
1402                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403                 command->index = (DPSOFTRAST_UNIFORM)index;
1404                 if (((size_t)v)&(ALIGN_SIZE-1))
1405                 {
1406                         m0 = _mm_loadu_ps(v);
1407                         m1 = _mm_loadu_ps(v+4);
1408                         m2 = _mm_loadu_ps(v+8);
1409                         m3 = _mm_loadu_ps(v+12);
1410                 }
1411                 else
1412                 {
1413                         m0 = _mm_load_ps(v);
1414                         m1 = _mm_load_ps(v+4);
1415                         m2 = _mm_load_ps(v+8);
1416                         m3 = _mm_load_ps(v+12);
1417                 }
1418                 if (transpose)
1419                 {
1420                         __m128 t0, t1, t2, t3;
1421                         t0 = _mm_unpacklo_ps(m0, m1);
1422                         t1 = _mm_unpacklo_ps(m2, m3);
1423                         t2 = _mm_unpackhi_ps(m0, m1);
1424                         t3 = _mm_unpackhi_ps(m2, m3);
1425                         m0 = _mm_movelh_ps(t0, t1);
1426                         m1 = _mm_movehl_ps(t1, t0);
1427                         m2 = _mm_movelh_ps(t2, t3);
1428                         m3 = _mm_movehl_ps(t3, t2);                     
1429                 }
1430                 _mm_store_ps(command->val, m0);
1431                 _mm_store_ps(command->val+4, m1);
1432                 _mm_store_ps(command->val+8, m2);
1433                 _mm_store_ps(command->val+12, m3);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1438         }
1439 #endif
1440 }
1441
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1444 {
1445         thread->uniform1i[command->index] = command->val;
1446 }
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 {
1449         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450         command->index = index;
1451         command->val = i0;
1452
1453         dpsoftrast.uniform1i[command->index] = i0;
1454 }
1455
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 {
1459         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1461 }
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 {
1464         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465         command->clipplane[0] = x;
1466         command->clipplane[1] = y;
1467         command->clipplane[2] = z;
1468         command->clipplane[3] = w;
1469 }
1470
1471 #ifdef SSE_POSSIBLE
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1473 {
1474         float *end = dst + size*4;
1475         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1476         {
1477                 while (dst < end)
1478                 {
1479                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1480                         dst += 4;
1481                         src += stride;
1482                 }
1483         }
1484         else
1485         {
1486                 while (dst < end)
1487                 {
1488                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1489                         dst += 4;
1490                         src += stride;
1491                 }
1492         }
1493 }
1494
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1496 {
1497         float *end = dst + size*4;
1498         if (stride == sizeof(float[3]))
1499         {
1500                 float *end4 = dst + (size&~3)*4;        
1501                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1502                 {
1503                         while (dst < end4)
1504                         {
1505                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1506                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518                                 dst += 16;
1519                                 src += 4*sizeof(float[3]);
1520                         }
1521                 }
1522                 else
1523                 {
1524                         while (dst < end4)
1525                         {
1526                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539                                 dst += 16;
1540                                 src += 4*sizeof(float[3]);
1541                         }
1542                 }
1543         }
1544         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1545         {
1546                 while (dst < end)
1547                 {
1548                         __m128 v = _mm_loadu_ps((const float *)src);
1549                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552                         _mm_store_ps(dst, v);
1553                         dst += 4;
1554                         src += stride;
1555                 }
1556         }
1557         else
1558         {
1559                 while (dst < end)
1560                 {
1561                         __m128 v = _mm_load_ps((const float *)src);
1562                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565                         _mm_store_ps(dst, v);
1566                         dst += 4;
1567                         src += stride;
1568                 }
1569         }
1570 }
1571
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1573 {
1574         float *end = dst + size*4;
1575         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1576         if (stride == sizeof(float[2]))
1577         {
1578                 float *end2 = dst + (size&~1)*4;
1579                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1580                 {
1581                         while (dst < end2)
1582                         {
1583                                 __m128 v = _mm_loadu_ps((const float *)src);
1584                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1585                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1586                                 dst += 8;
1587                                 src += 2*sizeof(float[2]);
1588                         }
1589                 }
1590                 else
1591                 {
1592                         while (dst < end2)
1593                         {
1594                                 __m128 v = _mm_load_ps((const float *)src);
1595                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1597                                 dst += 8;
1598                                 src += 2*sizeof(float[2]);
1599                         }
1600                 }
1601         }
1602         while (dst < end)
1603         {
1604                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1605                 dst += 4;
1606                 src += stride;
1607         }
1608 }
1609
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1611 {
1612         float *end = dst + size*4;
1613         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1614         if (stride == sizeof(unsigned char[4]))
1615         {
1616                 float *end4 = dst + (size&~3)*4;
1617                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1618                 {
1619                         while (dst < end4)
1620                         {
1621                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1626                                 dst += 16;
1627                                 src += 4*sizeof(unsigned char[4]);
1628                         }
1629                 }
1630                 else
1631                 {
1632                         while (dst < end4)
1633                         {
1634                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1639                                 dst += 16;
1640                                 src += 4*sizeof(unsigned char[4]);
1641                         }
1642                 }
1643         }
1644         while (dst < end)
1645         {
1646                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1648                 dst += 4;
1649                 src += stride;
1650         }
1651 }
1652
1653 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1654 {
1655         float *end = dst + 4*size;
1656         __m128 v = _mm_loadu_ps(src);
1657         while (dst < end)
1658         {
1659                 _mm_store_ps(dst, v);
1660                 dst += 4;
1661         }
1662 }
1663 #endif
1664
// Transforms an array of float4 vertices by a 4x4 matrix.
//   out4f       - destination, numitems*4 floats (16-byte aligned when the
//                 SSE path is compiled in, since aligned stores are used)
//   in4f        - source, numitems*4 floats; may be the same pointer as out4f
//   numitems    - number of vertices; values <= 0 do nothing
//   inmatrix16f - 16 floats treated as four consecutive float4 rows m0..m3
// Each output is in.x*m0 + in.y*m1 + in.z*m2 + in.w*m3. An exact identity
// matrix degenerates to a plain copy. Builds without SSE_POSSIBLE use a
// scalar loop that evaluates the same sum in the same order, instead of
// leaving the output untouched.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
#ifdef SSE_POSSIBLE
        __m128 m0, m1, m2, m3;
        float *end;
#else
        const float *m = inmatrix16f;
        int i;
#endif
        // nothing to do, or nothing valid to read/write
        if (numitems <= 0 || !out4f || !in4f)
                return;
        if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
        {
                // fast case for identity matrix
                if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
#ifdef SSE_POSSIBLE
        end = out4f + numitems*4;
        m0 = _mm_loadu_ps(inmatrix16f);
        m1 = _mm_loadu_ps(inmatrix16f + 4);
        m2 = _mm_loadu_ps(inmatrix16f + 8);
        m3 = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
        {
                while (out4f < end)
                {
                        __m128 v = _mm_loadu_ps(in4f);
                        _mm_store_ps(out4f,
                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
                                                        _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
                                                                                _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
                        out4f += 4;
                        in4f += 4;
                }
        }
        else
        {
                while (out4f < end)
                {
                        __m128 v = _mm_load_ps(in4f);
                        _mm_store_ps(out4f,
                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
                                                _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
                                                        _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
                                                                                _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
                        out4f += 4;
                        in4f += 4;
                }
        }
#else
        for (i = 0; i < numitems; i++, out4f += 4, in4f += 4)
        {
                // read the whole vertex before writing: out4f may alias in4f
                const float x = in4f[0], y = in4f[1], z = in4f[2], w = in4f[3];
                out4f[0] = x*m[0] + (y*m[4] + (z*m[ 8] + w*m[12]));
                out4f[1] = x*m[1] + (y*m[5] + (z*m[ 9] + w*m[13]));
                out4f[2] = x*m[2] + (y*m[6] + (z*m[10] + w*m[14]));
                out4f[3] = x*m[3] + (y*m[7] + (z*m[11] + w*m[15]));
        }
#endif
}
1712
// Copies numitems float4 vertices from in4f to out4f.
// A non-positive count or out4f == in4f is a no-op (the same-pointer guard
// DPSOFTRAST_Vertex_Transform uses on its identity path); memmove keeps
// partially overlapping ranges well defined.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        if (numitems <= 0 || out4f == in4f)
                return;
        memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1717
1718 #ifdef SSE_POSSIBLE
// Perspective-divides a float4 vertex and applies the viewport mapping.
// With in = (x, y, z, w), center = (c0, c1, c2, c3) and scale = (s0, s1, s2, s3)
// the result is
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// i.e. center/scale are laid out with the w-slot first, and the vertex is
// rotated into that layout (with 1.0 in the w-slot) before the math and
// rotated back afterwards.
// The expansion is a bare { } block that declares locals named p and w, so
// the `in` argument must not itself be an expression using those names;
// `in` is evaluated once, `out` may be the same variable as `in`.
1719 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1720 { \
1721         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1722         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1723         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1724         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1725 }
1726
// Same expansion, token for token, as DPSOFTRAST_PROJECTVERTEX: divides by w
// and applies viewport center/scale to all four lanes, not only to y.
// NOTE(review): no use of this macro is visible in this part of the file -
// confirm whether it is still needed or was meant to be a y-only variant.
1727 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1728 { \
1729         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1730         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1731         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1732         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1733 }
1734
// Multiplies a float4 vertex by a matrix given as four __m128 rows:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// The expansion is a bare { } block declaring a local named p; `in` is
// evaluated once and `out` may be the same variable as `in`.
1735 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1736 { \
1737         __m128 p = (in); \
1738         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1739                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1740                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1741                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1742 }
1743
// Bounds the screen-space y range covered by an axis-aligned box after it is
// transformed by inmatrix16f, and reports which box corners are clipped.
//   starty, endy - outputs: *starty is the truncated mapped maximum of y/w,
//                  *endy is the truncated mapped minimum of y/w plus one
//                  (presumably the viewport y scale is negative, which would
//                  make starty <= endy - confirm against the viewport setup)
//   minposf, maxposf - box corners as float4, read with aligned loads (must
//                  be 16-byte aligned)
//   inmatrix16f  - 16 floats, four float4 rows, read with unaligned loads
// Returns a mask with bit k set when corner k has z + w < 0 after the
// transform. Corner k takes x from maxpos when bit 0 of k is set, y when
// bit 1 is set, and z and w when bit 2 is set; otherwise from minpos.
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1745 {
1746         int clipmask = 0xFF;
1747         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1748         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
     // swap elements 0 and 1 of every matrix row so that transformed vectors
     // come out as (y, x, z, w): the scalar (_ss) operations below then act on y
1751         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
     // BBFRONT(k, pos): transform corner k, record its clip distance z + w, and
     // if that distance is >= 0 clear bit k of clipmask and fold y/w into the
     // running minimum/maximum
1755         #define BBFRONT(k, pos) \
1756         { \
1757                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1760                 { \
1761                         __m128 proj; \
1762                         clipmask &= ~(1<<k); \
1763                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764                         minproj = _mm_min_ss(minproj, proj); \
1765                         maxproj = _mm_max_ss(maxproj, proj); \
1766                 } \
1767         }
1768         BBFRONT(0, minpos); 
1769         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1770         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1771         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1772         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1773         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1774         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1775         BBFRONT(7, maxpos);
     // BBCLIP(k): for a clipped corner k, walk the three box edges leaving it
     // (neighbours k^1, k^2, k^4); for each neighbour that is not clipped,
     // interpolate along the edge to the point where the clip distance reaches
     // zero, divide by w and fold the result into the running minimum/maximum
1776         #define BBCLIP(k) \
1777         { \
1778                 if (clipmask&(1<<k)) \
1779                 { \
1780                         if (!(clipmask&(1<<(k^1)))) \
1781                         { \
1782                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785                                 minproj = _mm_min_ss(minproj, proj); \
1786                                 maxproj = _mm_max_ss(maxproj, proj); \
1787                         } \
1788                         if (!(clipmask&(1<<(k^2)))) \
1789                         { \
1790                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793                                 minproj = _mm_min_ss(minproj, proj); \
1794                                 maxproj = _mm_max_ss(maxproj, proj); \
1795                         } \
1796                         if (!(clipmask&(1<<(k^4)))) \
1797                         { \
1798                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801                                 minproj = _mm_min_ss(minproj, proj); \
1802                                 maxproj = _mm_max_ss(maxproj, proj); \
1803                         } \
1804                 } \
1805         }
1806         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
     // bring element 2 of the viewport vectors (the y slot in the layout used
     // by DPSOFTRAST_PROJECTVERTEX) down to lane 0 for the scalar math below
1807         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
     // limit the projected range to [-2, 2] before mapping it to pixels
1809         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1813         *starty = _mm_cvttss_si32(maxproj);
1814         *endy = _mm_cvttss_si32(minproj)+1;
1815         return clipmask;
1816 }
1817         
// Copies numitems float4 vertices from in4f to out4f unchanged and writes
// their viewport-projected form (DPSOFTRAST_PROJECTVERTEX) to screen4f.
//   out4f, screen4f - destinations, written with aligned stores (must be
//                     16-byte aligned)
//   in4f            - source; aligned or unaligned loads are chosen from its
//                     address
//   starty, endy    - optional; when both are non-NULL they receive the row
//                     range computed by DPSOFTRAST_Vertex_BoundY from the
//                     componentwise min/max of the input vertices, using an
//                     identity matrix
// Returns the clip mask from DPSOFTRAST_Vertex_BoundY, or 0 when starty/endy
// were not both supplied.
// Note: the first vertex is loaded to seed min/max before the loop, so in4f
// is read even when numitems is 0.
// Note: DPSOFTRAST_PROJECTVERTEX declares locals named p and w in its
// expansion; locals here must not use those names.
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1819 {
1820         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821         float *end = out4f + numitems*4;
1822         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823         __m128 minpos, maxpos;
1824         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1825         {
1826                 minpos = maxpos = _mm_loadu_ps(in4f);
1827                 while (out4f < end)
1828                 {
1829                         __m128 v = _mm_loadu_ps(in4f);
1830                         minpos = _mm_min_ps(minpos, v);
1831                         maxpos = _mm_max_ps(maxpos, v);
1832                         _mm_store_ps(out4f, v);
1833                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834                         _mm_store_ps(screen4f, v);
1835                         in4f += 4;
1836                         out4f += 4;
1837                         screen4f += 4;
1838                 }
1839         }
1840         else
1841         {
     // same loop as above, with aligned loads
1842                 minpos = maxpos = _mm_load_ps(in4f);
1843                 while (out4f < end)
1844                 {
1845                         __m128 v = _mm_load_ps(in4f);
1846                         minpos = _mm_min_ps(minpos, v);
1847                         maxpos = _mm_max_ps(maxpos, v);
1848                         _mm_store_ps(out4f, v);
1849                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850                         _mm_store_ps(screen4f, v);
1851                         in4f += 4;
1852                         out4f += 4;
1853                         screen4f += 4;
1854                 }
1855         }
1856         if (starty && endy) 
1857         {
     // DPSOFTRAST_Vertex_BoundY reads the box with aligned loads, hence
     // the ALIGN()ed temporaries
1858                 ALIGN(float minposf[4]);
1859                 ALIGN(float maxposf[4]);
1860                 _mm_store_ps(minposf, minpos);
1861                 _mm_store_ps(maxposf, maxpos);
1862                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1863         }
1864         return 0;
1865 }
1866
// Transforms numitems float4 vertices by inmatrix16f into out4f and writes
// their viewport-projected form (DPSOFTRAST_PROJECTVERTEX) to screen4f.
//   out4f, screen4f - destinations, written with aligned stores (must be
//                     16-byte aligned)
//   in4f            - source; aligned or unaligned loads are chosen from its
//                     address
//   inmatrix16f     - 16 floats, four float4 rows; an exact identity matrix
//                     is handed to DPSOFTRAST_Vertex_Project instead
//   starty, endy    - optional; when both are non-NULL they receive the row
//                     range computed by DPSOFTRAST_Vertex_BoundY
// Returns the clip mask from DPSOFTRAST_Vertex_BoundY, or 0 when starty/endy
// were not both supplied.
// Note: min/max are accumulated from the vertices BEFORE transformation; the
// box is transformed inside DPSOFTRAST_Vertex_BoundY, which receives
// inmatrix16f.
// Note: the first vertex is loaded to seed min/max before the loop, so in4f
// is read even when numitems is 0.
// Note: the two macros used in the loops declare locals named p and w in
// their expansions; locals here must not use those names.
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1868 {
1869         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1871         float *end;
1872         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874         end = out4f + numitems*4;
1875         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877         m0 = _mm_loadu_ps(inmatrix16f);
1878         m1 = _mm_loadu_ps(inmatrix16f + 4);
1879         m2 = _mm_loadu_ps(inmatrix16f + 8);
1880         m3 = _mm_loadu_ps(inmatrix16f + 12);
1881         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1882         {
1883                 minpos = maxpos = _mm_loadu_ps(in4f);
1884                 while (out4f < end)
1885                 {
1886                         __m128 v = _mm_loadu_ps(in4f);
1887                         minpos = _mm_min_ps(minpos, v);
1888                         maxpos = _mm_max_ps(maxpos, v);
1889                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890                         _mm_store_ps(out4f, v);
1891                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892                         _mm_store_ps(screen4f, v);
1893                         in4f += 4;
1894                         out4f += 4;
1895                         screen4f += 4;
1896                 }
1897         }
1898         else
1899         {
     // same loop as above, with aligned loads
1900                 minpos = maxpos = _mm_load_ps(in4f);
1901                 while (out4f < end)
1902                 {
1903                         __m128 v = _mm_load_ps(in4f);
1904                         minpos = _mm_min_ps(minpos, v);
1905                         maxpos = _mm_max_ps(maxpos, v);
1906                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907                         _mm_store_ps(out4f, v);
1908                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909                         _mm_store_ps(screen4f, v);
1910                         in4f += 4;
1911                         out4f += 4;
1912                         screen4f += 4;
1913                 }
1914         }
1915         if (starty && endy) 
1916         {
     // DPSOFTRAST_Vertex_BoundY reads the box with aligned loads, hence
     // the ALIGN()ed temporaries
1917                 ALIGN(float minposf[4]);
1918                 ALIGN(float maxposf[4]);
1919                 _mm_store_ps(minposf, minpos);
1920                 _mm_store_ps(maxposf, maxpos);
1921                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1922         }
1923         return 0;
1924 }
1925 #endif
1926
// Expands one client vertex array into the packed float4 staging array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   inarray == DPSOFTRAST_ARRAY_POSITION: three floats per vertex
//   inarray == DPSOFTRAST_ARRAY_COLOR: float4 colors if present, else byte
//     colors if present, else the constant dpsoftrast.color for every vertex
//   anything else: texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0) with
//     2, 3 or 4 float components; when that unit has no pointer, or another
//     component count, the staging array is left as it was
// Returns the staging array, or NULL in builds without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *dest = dpsoftrast.post_array4f[outarray];
        const unsigned char *base;
        int first = dpsoftrast.firstvertex;
        int count = dpsoftrast.numvertices;
        int stride;

        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                base = (unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
                DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        base = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
                        DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        base = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
                        DPSOFTRAST_Load4bTo4f(dest, base, count, stride);
                }
                else
                        DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
        }
        else
        {
                // every other array id is a texture coordinate unit
                const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        const int components = dpsoftrast.components_texcoord[unit];
                        base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
                        if (components == 2)
                                DPSOFTRAST_Load2fTo4f(dest, base, count, stride);
                        else if (components == 3)
                                DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
                        else if (components == 4)
                                DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
                }
        }
        return dest;
#else
        return NULL;
#endif
}
1985
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1987 {
1988         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1990         return data;
1991 }
1992
1993 #if 0
// Compiled out by the surrounding #if 0.
// Projection-only counterpart of DPSOFTRAST_Array_TransformProject: loads
// (when inarray >= 0) or reuses the staging array, projects it in place into
// dpsoftrast.screencoord4f via DPSOFTRAST_Vertex_Project, and stores the
// returned clip mask and row range in dpsoftrast.drawclipped / drawstarty /
// drawendy. Returns the staging array, or NULL without SSE_POSSIBLE.
1994 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1995 {
1996 #ifdef SSE_POSSIBLE
1997         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1998         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
1999         return data;
2000 #else
2001         return NULL;
2002 #endif
2003 }
2004 #endif
2005
// Transforms a staging array in place by inmatrix16f and projects it into
// dpsoftrast.screencoord4f.
// When inarray >= 0 the staging array is first filled from that client array;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray] are
// used. The clip mask returned by DPSOFTRAST_Vertex_TransformProject is kept
// in dpsoftrast.drawclipped and the row range in drawstarty / drawendy.
// Returns the staging array, or NULL in builds without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *verts;

        if (inarray >= 0)
                verts = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                verts = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(verts, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, verts, dpsoftrast.numvertices, inmatrix16f);
        return verts;
#else
        return NULL;
#endif
}
2016
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2018 {
2019         int x;
2020         int startx = span->startx;
2021         int endx = span->endx;
2022         float wslope = triangle->w[0];
2023         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2024         float endz = 1.0f / (w + wslope * startx);
2025         if (triangle->w[0] == 0)
2026         {
2027                 // LordHavoc: fast flat polygons (HUD/menu)
2028                 for (x = startx;x < endx;x++)
2029                         zf[x] = endz;
2030                 return;
2031         }
2032         for (x = startx;x < endx;)
2033         {
2034                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2035                 float z = endz, dz;
2036                 if (nextsub >= endx) nextsub = endsub = endx-1;
2037                 endz = 1.0f / (w + wslope * nextsub);
2038                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039                 for (; x <= endsub; x++, z += dz)
2040                         zf[x] = z;
2041         }
2042 }
2043
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2045 {
2046 #ifdef SSE_POSSIBLE
2047         int x;
2048         int startx = span->startx;
2049         int endx = span->endx;
2050         int subx;
2051         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052         unsigned char * RESTRICT pixelmask = span->pixelmask;
2053         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2055         if (!pixel)
2056                 return;
2057         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058         pixeli += span->y * dpsoftrast.fb_width + span->x;
2059         // handle alphatest now (this affects depth writes too)
2060         if (thread->alphatest)
2061                 for (x = startx;x < endx;x++)
2062                         if (in4ub[x*4+3] < 128)
2063                                 pixelmask[x] = false;
2064         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065         // helps sprites, text and hud artwork
2066         switch(thread->fb_blendmode)
2067         {
2068         case DPSOFTRAST_BLENDMODE_ALPHA:
2069         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071                 for (x = startx;x < endx;x++)
2072                         if (in4ub[x*4+3] < 1)
2073                                 pixelmask[x] = false;
2074                 break;
2075         case DPSOFTRAST_BLENDMODE_OPAQUE:
2076         case DPSOFTRAST_BLENDMODE_ADD:
2077         case DPSOFTRAST_BLENDMODE_INVMOD:
2078         case DPSOFTRAST_BLENDMODE_MUL:
2079         case DPSOFTRAST_BLENDMODE_MUL2:
2080         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081         case DPSOFTRAST_BLENDMODE_INVADD:
2082                 break;
2083         }
2084         // put some special values at the end of the mask to ensure the loops end
2085         pixelmask[endx] = 1;
2086         pixelmask[endx+1] = 0;
2087         // LordHavoc: use a double loop to identify subspans, this helps the
2088         // optimized copy/blend loops to perform at their best, most triangles
2089         // have only one run of pixels, and do the search using wide reads...
2090         x = startx;
2091         while (x < endx)
2092         {
2093                 // if this pixel is masked off, it's probably not alone...
2094                 if (!pixelmask[x])
2095                 {
2096                         x++;
2097 #if 1
2098                         if (x + 8 < endx)
2099                         {
2100                                 // the 4-item search must be aligned or else it stalls badly
2101                                 if ((x & 3) && !pixelmask[x]) x++;
2102                                 if ((x & 3) && !pixelmask[x]) x++;
2103                                 if ((x & 3) && !pixelmask[x]) x++;
2104                                 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2105                                         x += 4;
2106                         }
2107 #endif
2108                         for (;!pixelmask[x];x++)
2109                                 ;
2110                         // rather than continue the loop, just check the end variable
2111                         if (x >= endx)
2112                                 break;
2113                 }
2114                 // find length of subspan
2115                 subx = x + 1;
2116 #if 1
2117                 if (x + 8 < endx)
2118                 {
2119                         if ((subx & 3) && pixelmask[subx]) subx++;
2120                         if ((subx & 3) && pixelmask[subx]) subx++;
2121                         if ((subx & 3) && pixelmask[subx]) subx++;
2122                         while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2123                                 subx += 4;
2124                 }
2125 #endif
2126                 for (;pixelmask[subx];subx++)
2127                         ;
2128                 // the checks can overshoot, so make sure to clip it...
2129                 if (subx > endx)
2130                         subx = endx;
2131                 // now that we know the subspan length...  process!
2132                 switch(thread->fb_blendmode)
2133                 {
2134                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2135 #if 0
2136                         if (subx - x >= 16)
2137                         {
2138                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2139                                 x = subx;
2140                         }
2141                         else
2142 #elif 1
2143                         while (x + 16 <= subx)
2144                         {
2145                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2146                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2147                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2148                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2149                                 x += 16;
2150                         }
2151 #endif
2152                         {
2153                                 while (x + 4 <= subx)
2154                                 {
2155                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2156                                         x += 4;
2157                                 }
2158                                 if (x + 2 <= subx)
2159                                 {
2160                                         pixeli[x] = ini[x];
2161                                         pixeli[x+1] = ini[x+1];
2162                                         x += 2;
2163                                 }
2164                                 if (x < subx)
2165                                 {
2166                                         pixeli[x] = ini[x];
2167                                         x++;
2168                                 }
2169                         }
2170                         break;
2171                 case DPSOFTRAST_BLENDMODE_ALPHA:
2172                 #define FINISHBLEND(blend2, blend1) \
2173                         for (;x + 1 < subx;x += 2) \
2174                         { \
2175                                 __m128i src, dst; \
2176                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2177                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2178                                 blend2; \
2179                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2180                         } \
2181                         if (x < subx) \
2182                         { \
2183                                 __m128i src, dst; \
2184                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2185                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2186                                 blend1; \
2187                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2188                                 x++; \
2189                         }
2190                         FINISHBLEND({
2191                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2192                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2193                         }, {
2194                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2195                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2196                         });
2197                         break;
2198                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2199                         FINISHBLEND({
2200                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2201                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2202                         }, {
2203                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2204                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2205                         });
2206                         break;
2207                 case DPSOFTRAST_BLENDMODE_ADD:
2208                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2209                         break;
2210                 case DPSOFTRAST_BLENDMODE_INVMOD:
2211                         FINISHBLEND({
2212                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2213                         }, {
2214                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2215                         });
2216                         break;
2217                 case DPSOFTRAST_BLENDMODE_MUL:
2218                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2219                         break;
2220                 case DPSOFTRAST_BLENDMODE_MUL2:
2221                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2222                         break;
2223                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2224                         FINISHBLEND({
2225                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2227                         }, {
2228                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2230                         });
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2233                         FINISHBLEND({
2234                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2236                         }, {
2237                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2239                         });
2240                         break;
2241                 case DPSOFTRAST_BLENDMODE_INVADD:
2242                         FINISHBLEND({
2243                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2244                         }, {
2245                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2246                         });
2247                         break;
2248                 }
2249         }
2250 #endif
2251 }
2252
2253 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2254 {
2255         int x;
2256         int startx = span->startx;
2257         int endx = span->endx;
2258         int flags;
2259         float c[4];
2260         float data[4];
2261         float slope[4];
2262         float tc[2], endtc[2];
2263         float tcscale[2];
2264         unsigned int tci[2];
2265         unsigned int tci1[2];
2266         unsigned int tcimin[2];
2267         unsigned int tcimax[2];
2268         int tciwrapmask[2];
2269         int tciwidth;
2270         int filter;
2271         int mip;
2272         const unsigned char * RESTRICT pixelbase;
2273         const unsigned char * RESTRICT pixel[4];
2274         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2275         // if no texture is bound, just fill it with white
2276         if (!texture)
2277         {
2278                 for (x = startx;x < endx;x++)
2279                 {
2280                         out4f[x*4+0] = 1.0f;
2281                         out4f[x*4+1] = 1.0f;
2282                         out4f[x*4+2] = 1.0f;
2283                         out4f[x*4+3] = 1.0f;
2284                 }
2285                 return;
2286         }
2287         mip = triangle->mip[texunitindex];
2288         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2289         // if this mipmap of the texture is 1 pixel, just fill it with that color
2290         if (texture->mipmap[mip][1] == 4)
2291         {
2292                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2293                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2294                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2295                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2296                 for (x = startx;x < endx;x++)
2297                 {
2298                         out4f[x*4+0] = c[0];
2299                         out4f[x*4+1] = c[1];
2300                         out4f[x*4+2] = c[2];
2301                         out4f[x*4+3] = c[3];
2302                 }
2303                 return;
2304         }
2305         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2306         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2307         flags = texture->flags;
2308         tcscale[0] = texture->mipmap[mip][2];
2309         tcscale[1] = texture->mipmap[mip][3];
2310         tciwidth = texture->mipmap[mip][2];
2311         tcimin[0] = 0;
2312         tcimin[1] = 0;
2313         tcimax[0] = texture->mipmap[mip][2]-1;
2314         tcimax[1] = texture->mipmap[mip][3]-1;
2315         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2316         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2317         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2318         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2319         if (filter)
2320         {
2321                 endtc[0] -= 0.5f;
2322                 endtc[1] -= 0.5f;
2323         }
2324         for (x = startx;x < endx;)
2325         {
2326                 unsigned int subtc[2];
2327                 unsigned int substep[2];
2328                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2329                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2330                 if (nextsub >= endx)
2331                 {
2332                         nextsub = endsub = endx-1;      
2333                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2334                 }
2335                 tc[0] = endtc[0];
2336                 tc[1] = endtc[1];
2337                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2338                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2339                 if (filter)
2340                 {
2341                         endtc[0] -= 0.5f;
2342                         endtc[1] -= 0.5f;
2343                 }
2344                 substep[0] = (endtc[0] - tc[0]) * subscale;
2345                 substep[1] = (endtc[1] - tc[1]) * subscale;
2346                 subtc[0] = tc[0] * (1<<12);
2347                 subtc[1] = tc[1] * (1<<12);
2348                 if (filter)
2349                 {
2350                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2351                         {
2352                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2353                                 {
2354                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2355                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2356                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2357                                         tci[0] = subtc[0]>>12;
2358                                         tci[1] = subtc[1]>>12;
2359                                         tci1[0] = tci[0] + 1;
2360                                         tci1[1] = tci[1] + 1;
2361                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2362                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2363                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2364                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2365                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2366                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2367                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2368                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2369                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2370                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2371                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2372                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2373                                         out4f[x*4+0] = c[0];
2374                                         out4f[x*4+1] = c[1];
2375                                         out4f[x*4+2] = c[2];
2376                                         out4f[x*4+3] = c[3];
2377                                 }
2378                         }
2379                         else
2380                         {
2381                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2382                                 {
2383                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2384                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2385                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2386                                         tci[0] = subtc[0]>>12;
2387                                         tci[1] = subtc[1]>>12;
2388                                         tci1[0] = tci[0] + 1;
2389                                         tci1[1] = tci[1] + 1;
2390                                         tci[0] &= tciwrapmask[0];
2391                                         tci[1] &= tciwrapmask[1];
2392                                         tci1[0] &= tciwrapmask[0];
2393                                         tci1[1] &= tciwrapmask[1];
2394                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2395                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2396                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2397                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2398                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2399                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2400                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2401                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2402                                         out4f[x*4+0] = c[0];
2403                                         out4f[x*4+1] = c[1];
2404                                         out4f[x*4+2] = c[2];
2405                                         out4f[x*4+3] = c[3];
2406                                 }
2407                         }
2408                 }
2409                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2410                 {
2411                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2412                         {
2413                                 tci[0] = subtc[0]>>12;
2414                                 tci[1] = subtc[1]>>12;
2415                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2416                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2417                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2419                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2420                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2421                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2422                                 out4f[x*4+0] = c[0];
2423                                 out4f[x*4+1] = c[1];
2424                                 out4f[x*4+2] = c[2];
2425                                 out4f[x*4+3] = c[3];
2426                         }
2427                 }
2428                 else
2429                 {
2430                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2431                         {
2432                                 tci[0] = subtc[0]>>12;
2433                                 tci[1] = subtc[1]>>12;
2434                                 tci[0] &= tciwrapmask[0];
2435                                 tci[1] &= tciwrapmask[1];
2436                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2437                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2438                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2439                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2440                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2441                                 out4f[x*4+0] = c[0];
2442                                 out4f[x*4+1] = c[1];
2443                                 out4f[x*4+2] = c[2];
2444                                 out4f[x*4+3] = c[3];
2445                         }
2446                 }
2447         }
2448 }
2449
2450 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2451 {
2452 #ifdef SSE_POSSIBLE
2453         int x;
2454         int startx = span->startx;
2455         int endx = span->endx;
2456         int flags;
2457         __m128 data, slope, tcscale;
2458         __m128i tcsize, tcmask, tcoffset, tcmax;
2459         __m128 tc, endtc;
2460         __m128i subtc, substep, endsubtc;
2461         int filter;
2462         int mip;
2463         int affine; // LordHavoc: optimized affine texturing case
2464         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2465         const unsigned char * RESTRICT pixelbase;
2466         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2467         // if no texture is bound, just fill it with white
2468         if (!texture)
2469         {
2470                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2471                 return;
2472         }
2473         mip = triangle->mip[texunitindex];
2474         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2475         // if this mipmap of the texture is 1 pixel, just fill it with that color
2476         if (texture->mipmap[mip][1] == 4)
2477         {
2478                 unsigned int k = *((const unsigned int *)pixelbase);
2479                 for (x = startx;x < endx;x++)
2480                         outi[x] = k;
2481                 return;
2482         }
2483         affine = zf[startx] == zf[endx-1];
2484         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2485         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2486         flags = texture->flags;
2487         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2488         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2489         tcscale = _mm_cvtepi32_ps(tcsize);
2490         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2491         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2492         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2493         if (filter)
2494                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2495         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2496         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2497         tcmax = _mm_packs_epi32(tcmask, tcmask);
2498         for (x = startx;x < endx;)
2499         {
2500                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2501                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2502                 if (nextsub >= endx || affine)
2503                 {
2504                         nextsub = endsub = endx-1;
2505                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2506                 }       
2507                 tc = endtc;
2508                 subtc = endsubtc;
2509                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2510                 if (filter)
2511                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2512                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2513                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2514                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2515                 substep = _mm_slli_epi32(substep, 1);
2516                 if (filter)
2517                 {
2518                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2519                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2520                         {
2521                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2522                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2523                                 {
2524                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2525                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2526                                         tci = _mm_madd_epi16(tci, tcoffset);
2527                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2528                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2529                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2530                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2531                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2532                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2533                                         fracm = _mm_srli_epi16(subtc, 1);
2534                                         pix1 = _mm_add_epi16(pix1,
2535                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2536                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2537                                         pix3 = _mm_add_epi16(pix3,
2538                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2539                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2540                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2541                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2542                                         pix2 = _mm_add_epi16(pix2,
2543                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2544                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2545                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2546                                 }
2547                                 if (x <= endsub)
2548                                 {
2549                                         const unsigned char * RESTRICT ptr1;
2550                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2551                                         tci = _mm_madd_epi16(tci, tcoffset);
2552                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2553                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2554                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2555                                         fracm = _mm_srli_epi16(subtc, 1);
2556                                         pix1 = _mm_add_epi16(pix1,
2557                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2558                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2559                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2560                                         pix1 = _mm_add_epi16(pix1,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2563                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2564                                         x++;
2565                                 }
2566                         }
2567                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2568                         {
2569                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2570                                 {
2571                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2572                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2573                                         tci = _mm_madd_epi16(tci, tcoffset);
2574                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2575                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2576                                                                                         _mm_setzero_si128());
2577                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2578                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2579                                                                                         _mm_setzero_si128());
2580                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2581                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2582                                         tci = _mm_madd_epi16(tci, tcoffset);
2583                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2584                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2585                                                                                         _mm_setzero_si128());
2586                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2587                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2588                                                                                         _mm_setzero_si128());
2589                                         fracm = _mm_srli_epi16(subtc, 1);
2590                                         pix1 = _mm_add_epi16(pix1,
2591                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2592                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2593                                         pix3 = _mm_add_epi16(pix3,
2594                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2595                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2596                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2597                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2598                                         pix2 = _mm_add_epi16(pix2,
2599                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2600                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2601                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2602                                 }
2603                                 if (x <= endsub)
2604                                 {
2605                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2606                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2607                                         tci = _mm_madd_epi16(tci, tcoffset);
2608                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2609                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2610                                                                                         _mm_setzero_si128());
2611                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2612                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2613                                                                                         _mm_setzero_si128());
2614                                         fracm = _mm_srli_epi16(subtc, 1);
2615                                         pix1 = _mm_add_epi16(pix1,
2616                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2617                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2618                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2619                                         pix1 = _mm_add_epi16(pix1,
2620                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2621                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2622                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2623                                         x++;
2624                                 }
2625                         }
2626                         else
2627                         {
2628                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2629                                 {
2630                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2631                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2632                                         tci = _mm_madd_epi16(tci, tcoffset);
2633                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2634                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2635                                                                                         _mm_setzero_si128());
2636                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2637                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2638                                                                                         _mm_setzero_si128());
2639                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2640                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2641                                         tci = _mm_madd_epi16(tci, tcoffset);
2642                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2643                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2644                                                                                         _mm_setzero_si128());
2645                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2646                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2647                                                                                         _mm_setzero_si128());
2648                                         fracm = _mm_srli_epi16(subtc, 1);
2649                                         pix1 = _mm_add_epi16(pix1,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2651                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2652                                         pix3 = _mm_add_epi16(pix3,
2653                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2654                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2655                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2656                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2657                                         pix2 = _mm_add_epi16(pix2,
2658                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2659                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2660                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2661                                 }
2662                                 if (x <= endsub)
2663                                 {
2664                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2665                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2666                                         tci = _mm_madd_epi16(tci, tcoffset);
2667                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2668                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2669                                                                                         _mm_setzero_si128());
2670                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2671                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2672                                                                                         _mm_setzero_si128());
2673                                         fracm = _mm_srli_epi16(subtc, 1);
2674                                         pix1 = _mm_add_epi16(pix1,
2675                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2676                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2677                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2678                                         pix1 = _mm_add_epi16(pix1,
2679                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2680                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2681                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2682                                         x++;
2683                                 }
2684                         }
2685                 }
2686                 else
2687                 {
2688                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2689                         {
2690                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2691                                 {
2692                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2693                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2694                                         tci = _mm_madd_epi16(tci, tcoffset);
2695                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2696                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2697                                 }
2698                                 if (x <= endsub)
2699                                 {
2700                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2701                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2702                                         tci = _mm_madd_epi16(tci, tcoffset);
2703                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2704                                         x++;
2705                                 }
2706                         }
2707                         else
2708                         {
2709                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2710                                 {
2711                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712                                         tci = _mm_and_si128(tci, tcmax); 
2713                                         tci = _mm_madd_epi16(tci, tcoffset);
2714                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2716                                 }
2717                                 if (x <= endsub)
2718                                 {
2719                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720                                         tci = _mm_and_si128(tci, tcmax); 
2721                                         tci = _mm_madd_epi16(tci, tcoffset);
2722                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723                                         x++;
2724                                 }
2725                         }
2726                 }
2727         }
2728 #endif
2729 }
2730
2731 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2732 {
2733         // TODO: IMPLEMENT
2734         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2735 }
2736
// Shadowmap lookup placeholder: the input vector is ignored and the result
// is always 1.0f (presumably meaning "not shadowed" - confirm with callers).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2742
2743 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2744 {
2745         int x;
2746         int startx = span->startx;
2747         int endx = span->endx;
2748         float c[4];
2749         float data[4];
2750         float slope[4];
2751         float z;
2752         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2753         for (x = startx;x < endx;x++)
2754         {
2755                 z = zf[x];
2756                 c[0] = (data[0] + slope[0]*x) * z;
2757                 c[1] = (data[1] + slope[1]*x) * z;
2758                 c[2] = (data[2] + slope[2]*x) * z;
2759                 c[3] = (data[3] + slope[3]*x) * z;
2760                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2761                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2762                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2763                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2764         }
2765 }
2766
2767 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2768 {
2769         int x;
2770         int startx = span->startx;
2771         int endx = span->endx;
2772         float c[4];
2773         float data[4];
2774         float slope[4];
2775         float z;
2776         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2777         for (x = startx;x < endx;x++)
2778         {
2779                 z = zf[x];
2780                 c[0] = (data[0] + slope[0]*x) * z;
2781                 c[1] = (data[1] + slope[1]*x) * z;
2782                 c[2] = (data[2] + slope[2]*x) * z;
2783                 c[3] = (data[3] + slope[3]*x) * z;
2784                 out4f[x*4+0] = c[0];
2785                 out4f[x*4+1] = c[1];
2786                 out4f[x*4+2] = c[2];
2787                 out4f[x*4+3] = c[3];
2788         }
2789 }
2790
2791 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2792 {
2793         int x, startx = span->startx, endx = span->endx;
2794         float c[4], localcolor[4];
2795         localcolor[0] = subcolor[0];
2796         localcolor[1] = subcolor[1];
2797         localcolor[2] = subcolor[2];
2798         localcolor[3] = subcolor[3];
2799         for (x = startx;x < endx;x++)
2800         {
2801                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2802                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2803                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2804                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2805                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2806                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2807                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2808                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2809         }
2810 }
2811
2812 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2813 {
2814         int x, startx = span->startx, endx = span->endx;
2815         for (x = startx;x < endx;x++)
2816         {
2817                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2818                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2819                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2820                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2821         }
2822 }
2823
2824 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2825 {
2826         int x, startx = span->startx, endx = span->endx;
2827         for (x = startx;x < endx;x++)
2828         {
2829                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2830                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2831                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2832                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2833         }
2834 }
2835
2836 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2837 {
2838         int x, startx = span->startx, endx = span->endx;
2839         float a, b;
2840         for (x = startx;x < endx;x++)
2841         {
2842                 a = 1.0f - inb4f[x*4+3];
2843                 b = inb4f[x*4+3];
2844                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2845                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2846                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2847                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2848         }
2849 }
2850
2851 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2852 {
2853         int x, startx = span->startx, endx = span->endx;
2854         float localcolor[4], ilerp, lerp;
2855         localcolor[0] = color[0];
2856         localcolor[1] = color[1];
2857         localcolor[2] = color[2];
2858         localcolor[3] = color[3];
2859         ilerp = 1.0f - localcolor[3];
2860         lerp = localcolor[3];
2861         for (x = startx;x < endx;x++)
2862         {
2863                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2864                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2865                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2866                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2867         }
2868 }
2869
2870
2871
2872 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2873 {
2874 #ifdef SSE_POSSIBLE
2875         int x;
2876         int startx = span->startx;
2877         int endx = span->endx;
2878         __m128 data, slope;
2879         __m128 mod, endmod;
2880         __m128i submod, substep, endsubmod;
2881         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2882         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2883         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2884         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2885         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2886         for (x = startx; x < endx;)
2887         {
2888                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2889                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2890                 if (nextsub >= endx)
2891                 {
2892                         nextsub = endsub = endx-1;
2893                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2894                 }
2895                 mod = endmod;
2896                 submod = endsubmod;
2897                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2898                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2899                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2900                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2901                 substep = _mm_packs_epi32(substep, substep);
2902                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2903                 {
2904                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2905                         pix = _mm_mulhi_epu16(pix, submod);
2906                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2907                 }
2908                 if (x <= endsub)
2909                 {
2910                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2911                         pix = _mm_mulhi_epu16(pix, submod);
2912                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2913                         x++;
2914                 }
2915         }
2916 #endif
2917 }
2918
2919 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2920 {
2921 #ifdef SSE_POSSIBLE
2922         int x;
2923         int startx = span->startx;
2924         int endx = span->endx;
2925         __m128 data, slope;
2926         __m128 mod, endmod;
2927         __m128i submod, substep, endsubmod;
2928         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2929         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2930         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2931         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2932         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2933         for (x = startx; x < endx;)
2934         {
2935                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2936                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2937                 if (nextsub >= endx)
2938                 {
2939                         nextsub = endsub = endx-1;
2940                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2941                 }
2942                 mod = endmod;
2943                 submod = endsubmod;
2944                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2945                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2946                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2947                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2948                 substep = _mm_packs_epi32(substep, substep);
2949                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2950                 {
2951                         __m128i pix = _mm_srai_epi16(submod, 4);
2952                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2953                 }
2954                 if (x <= endsub)
2955                 {
2956                         __m128i pix = _mm_srai_epi16(submod, 4);
2957                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2958                         x++;
2959                 }
2960         }
2961 #endif
2962 }
2963
2964 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2965 {
2966 #ifdef SSE_POSSIBLE
2967         int x, startx = span->startx, endx = span->endx;
2968         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2969         localcolor = _mm_packs_epi32(localcolor, localcolor);
2970         for (x = startx;x+2 <= endx;x+=2)
2971         {
2972                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2973                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2974                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2975                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2976         }
2977         if (x < endx)
2978         {
2979                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2980                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2981                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2982                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2983         }
2984 #endif
2985 }
2986
2987 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2988 {
2989 #ifdef SSE_POSSIBLE
2990         int x, startx = span->startx, endx = span->endx;
2991         for (x = startx;x+2 <= endx;x+=2)
2992         {
2993                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2994                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2995                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2996                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2997         }
2998         if (x < endx)
2999         {
3000                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3001                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3002                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3003                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3004         }
3005 #endif
3006 }
3007
3008 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3009 {
3010 #ifdef SSE_POSSIBLE
3011         int x, startx = span->startx, endx = span->endx;
3012         for (x = startx;x+2 <= endx;x+=2)
3013         {
3014                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3015                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3016                 pix1 = _mm_add_epi16(pix1, pix2);
3017                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3018         }
3019         if (x < endx)
3020         {
3021                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3022                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3023                 pix1 = _mm_add_epi16(pix1, pix2);
3024                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3025         }
3026 #endif
3027 }
3028
3029 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3030 {
3031 #ifdef SSE_POSSIBLE
3032         int x, startx = span->startx, endx = span->endx;
3033         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3034         tint = _mm_packs_epi32(tint, tint);
3035         for (x = startx;x+2 <= endx;x+=2)
3036         {
3037                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3038                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3039                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3040                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3041         }
3042         if (x < endx)
3043         {
3044                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3045                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3046                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3047                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3048         }
3049 #endif
3050 }
3051
3052 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3053 {
3054 #ifdef SSE_POSSIBLE
3055         int x, startx = span->startx, endx = span->endx;
3056         for (x = startx;x+2 <= endx;x+=2)
3057         {
3058                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3059                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3060                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3061                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3062                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3063         }
3064         if (x < endx)
3065         {
3066                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3067                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3068                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3069                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3070                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3071         }
3072 #endif
3073 }
3074
3075 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3076 {
3077 #ifdef SSE_POSSIBLE
3078         int x, startx = span->startx, endx = span->endx;
3079         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3080         localcolor = _mm_packs_epi32(localcolor, localcolor);
3081         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3085                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3086                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3087         }
3088         if (x < endx)
3089         {
3090                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3091                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3092                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3093         }
3094 #endif
3095 }
3096
3097
3098
3099 void DPSOFTRAST_VertexShader_Generic(void)
3100 {
3101         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3102         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3103         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3104         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3105                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3106 }
3107
3108 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3109 {
3110         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3111         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3112         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3114         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3115         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3116         {
3117                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3118                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3119                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3120                 {
3121                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3122                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3123                         {
3124                                 // multiply
3125                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3126                         }
3127                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3128                         {
3129                                 // add
3130                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3131                         }
3132                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3133                         {
3134                                 // alphablend
3135                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3136                         }
3137                 }
3138         }
3139         else
3140                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3141         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3142 }
3143
3144
3145
3146 void DPSOFTRAST_VertexShader_PostProcess(void)
3147 {
3148         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3149         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3150         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3151 }
3152
3153 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3154 {
3155         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3156         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3157         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3158         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3159         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3160         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3161         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3162         {
3163                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3164                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3165         }
3166         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3167         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3168         {
3169                 // TODO: implement saturation
3170         }
3171         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3172         {
3173                 // TODO: implement gammaramps
3174         }
3175         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3176 }
3177
3178
3179
3180 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3181 {
3182         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3183 }
3184
3185 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3186 {
3187         // this is never called (because colormask is off when this shader is used)
3188         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3189         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3191         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3192         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3193 }
3194
3195
3196
3197 void DPSOFTRAST_VertexShader_FlatColor(void)
3198 {
3199         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3200         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3201 }
3202
3203 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3204 {
3205 #ifdef SSE_POSSIBLE
3206         unsigned char * RESTRICT pixelmask = span->pixelmask;
3207         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3208         int x, startx = span->startx, endx = span->endx;
3209         __m128i Color_Ambientm;
3210         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3211         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3212         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3213         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3214         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3215         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3216                 pixel = buffer_FragColorbgra8;
3217         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3218         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3219         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3220         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3221         for (x = startx;x < endx;x++)
3222         {
3223                 __m128i color, pix;
3224                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3225                 {
3226                         __m128i pix2;
3227                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3228                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3229                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3230                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3231                         x += 3;
3232                         continue;
3233                 }
3234                 if (!pixelmask[x])
3235                         continue;
3236                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3237                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3238                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3239         }
3240         if (pixel == buffer_FragColorbgra8)
3241                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3242 #endif
3243 }
3244
3245
3246
3247 void DPSOFTRAST_VertexShader_VertexColor(void)
3248 {
3249         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3250         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3251         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3252 }
3253
// Pixel stage of the vertex-color shader for one span.
// Each covered pixel becomes texel * (Color_Diffuse * interpolated color + Color_Ambient),
// computed in 16-bit fixed point, with the Alpha uniform placed in the ambient term's
// fourth lane and the diffuse term's fourth lane forced to zero.
// Output goes straight to the framebuffer row unless alpha test is on or the blend mode
// is not opaque; then it goes to buffer_FragColorbgra8 and DPSOFTRAST_Draw_Span_FinishBGRA8.
// The whole body is compiled only when SSE_POSSIBLE is defined.
3254 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3255 {
3256 #ifdef SSE_POSSIBLE
3257         unsigned char * RESTRICT pixelmask = span->pixelmask;
3258         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3259         int x, startx = span->startx, endx = span->endx;
3260         __m128i Color_Ambientm, Color_Diffusem;
3261         __m128 data, slope;
3262         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3266         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3267         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // redirect output to the temporary span buffer when a finishing pass is needed
3268         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3269                 pixel = buffer_FragColorbgra8;
             // ambient: uniform * 256 with components 0 and 2 swapped, lane 3 replaced by Alpha * 255,
             // then narrowed to 16 bits (both 64-bit halves hold the same four lanes)
3270         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3271         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3272         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3273         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: uniform * 4096 with the same component swap, lane 3 zeroed
3274         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3275         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3276         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // presumably fills data/slope with the color attribute's start value and per-pixel
             // step for this span - confirm against the DPSOFTRAST_CALCATTRIB macro
3277         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // same component swap as the uniforms, advance to startx, scale by 4096
3278         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3279         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3280         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3281         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3282         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is stepped once per pixel by the loop increment, including for masked-out pixels
3283         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3284         {
3285                 __m128i color, mod, pix;
                     // fast path: four consecutive mask bytes are all 1, shade four pixels at once
3286                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3287                 {
3288                         __m128i pix2, mod2;
3289                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3290                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // data is stepped three times here; together with x += 3 below, the loop
                             // increment supplies the fourth step for both x and data
3291                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3292                         data = _mm_add_ps(data, slope);
3293                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3294                         data = _mm_add_ps(data, slope);
3295                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3296                         data = _mm_add_ps(data, slope);
3297                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // texels are widened into the high byte of each 16-bit lane for the mulhi
3298                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3299                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3300                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3301                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3302                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3303                         x += 3;
3304                         continue;
3305                 }
                     // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3306                 if (!pixelmask[x])
3307                         continue;
3308                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3309                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3310                 mod = _mm_packs_epi32(mod, mod);
3311                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3312                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3313         }
             // only the redirected case needs the finishing pass
3314         if (pixel == buffer_FragColorbgra8)
3315                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3316 #endif
3317 }
3318
3319
3320
3321 void DPSOFTRAST_VertexShader_Lightmap(void)
3322 {
3323         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3324         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3325         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3326 }
3327
// Pixel stage of the lightmap shader for one span.
// Each covered pixel becomes color * (Color_Diffuse * lightmap + Color_Ambient), plus
// Color_Glow * glow when the GLOW permutation bit is set, in 16-bit fixed point.
// The Alpha uniform is placed in the ambient term's fourth lane; the fourth lanes of the
// diffuse and glow terms are forced to zero.
// Output goes straight to the framebuffer row unless alpha test is on or the blend mode
// is not opaque; then it goes to buffer_FragColorbgra8 and DPSOFTRAST_Draw_Span_FinishBGRA8.
// The whole body is compiled only when SSE_POSSIBLE is defined.
3328 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3329 {
3330 #ifdef SSE_POSSIBLE
3331         unsigned char * RESTRICT pixelmask = span->pixelmask;
3332         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3333         int x, startx = span->startx, endx = span->endx;
3334         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3335         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3336         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3340         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture uses the TEXCOORD0 attribute, lightmap uses TEXCOORD4
3341         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3342         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // redirect output to the temporary span buffer when a finishing pass is needed
3343         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3344                 pixel = buffer_FragColorbgra8;
             // ambient: uniform * 256 with components 0 and 2 swapped, lane 3 replaced by Alpha * 255
3345         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3346         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3347         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3348         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: uniform * 256 with the same component swap, lane 3 zeroed
3349         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3350         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3351         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3352         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3353         {
3354                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow: uniform * 256 with the same component swap, lane 3 zeroed
3355                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3356                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3357                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow color; used by the one-pixel path below
3358                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3359                 for (x = startx;x < endx;x++)
3360                 {
3361                         __m128i color, lightmap, glow, pix;
                             // fast path: four consecutive mask bytes are all 1, shade four pixels at once
3362                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3363                         {
3364                                 __m128i pix2;
3365                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3366                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3367                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // pix covers pixels x and x+1, pix2 covers x+2 and x+3
3368                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3369                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3370                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3371                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3372                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3373                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3374                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // the loop increment supplies the fourth step
3375                                 x += 3;
3376                                 continue;
3377                         }
                             // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3378                         if (!pixelmask[x])
3379                                 continue;
3380                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3381                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3382                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // lightmap's high 64 bits are zero, so the low half computes
                             // (diffuse*lightmap + ambient) * color and the high half glowcolor * glow
3383                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the glow term (high half) onto the lit color (low half)
3384                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3385                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3386                 }
3387         }
3388         else
3389         {
                     // no glow: same loop without the glow texture and glow term
3390                 for (x = startx;x < endx;x++)
3391                 {
3392                         __m128i color, lightmap, pix;
3393                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3394                         {
3395                                 __m128i pix2;
3396                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3397                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3398                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3399                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3400                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3401                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3402                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3403                                 x += 3;
3404                                 continue;
3405                         }
3406                         if (!pixelmask[x]) 
3407                                 continue;
3408                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3409                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3410                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3411                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3412                 }
3413         }
             // only the redirected case needs the finishing pass
3414         if (pixel == buffer_FragColorbgra8)
3415                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3416 #endif
3417 }
3418
3419
// Forward declarations: the per-shader-mode wrappers that follow call these
// two functions before their definitions appear further down in this file.
3420 void DPSOFTRAST_VertexShader_LightDirection(void);
3421 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3422
// Vertex stage for the FakeLight shader mode.
// It has no work of its own: it runs the shared LightDirection vertex shader.
3423 void DPSOFTRAST_VertexShader_FakeLight(void)
3424 {
3425         DPSOFTRAST_VertexShader_LightDirection();
3426 }
3427
// Pixel stage for the FakeLight shader mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection
// pixel shader; that function compares thread->shader_mode against
// SHADERMODE_FAKELIGHT to pick its fake-light branches (light normal taken
// from the interpolated eye vector, light color set to 1.0).
3428 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3429 {
3430         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3431 }
3432
3433
3434
// Vertex stage for the LightDirectionMap_ModelSpace shader mode.
// Runs the shared LightDirection vertex shader first, then additionally
// loads TEXCOORD4, which the shared pixel shader uses as the texture
// coordinate for its lightmap and deluxemap lookups in this mode.
3435 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3436 {
3437         DPSOFTRAST_VertexShader_LightDirection();
3438         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4); // NOTE(review): presumably copies the input TEXCOORD4 attribute into the post-transform array - confirm against DPSOFTRAST_Array_Load
3439 }
3440
// Pixel stage for the LightDirectionMap_ModelSpace shader mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection
// pixel shader, which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE to sample the
// lightmap/deluxemap and rotate the deluxemap direction by the interpolated
// TEXCOORD1/2/3 vectors.
3441 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3442 {
3443         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3444 }
3445
3446
3447
// Vertex stage for the LightDirectionMap_TangentSpace shader mode.
// Same sequence as the ModelSpace variant: run the shared LightDirection
// vertex shader, then load TEXCOORD4, which the shared pixel shader uses for
// its lightmap and deluxemap lookups in this mode.
3448 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3449 {
3450         DPSOFTRAST_VertexShader_LightDirection();
3451         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4); // NOTE(review): presumably copies the input TEXCOORD4 attribute into the post-transform array - confirm against DPSOFTRAST_Array_Load
3452 }
3453
// Pixel stage for the LightDirectionMap_TangentSpace shader mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection
// pixel shader, which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE and uses the
// deluxemap texel directly as the light normal.
3454 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 {
3456         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3457 }
3458
3459
3460
3461 void DPSOFTRAST_VertexShader_LightDirection(void)
3462 {
3463         int i;
3464         int numvertices = dpsoftrast.numvertices;
3465         float LightDir[4];
3466         float LightVector[4];
3467         float EyePosition[4];
3468         float EyeVectorModelSpace[4];
3469         float EyeVector[4];
3470         float position[4];
3471         float svector[4];
3472         float tvector[4];
3473         float normal[4];
3474         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3475         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3476         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3477         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3478         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3479         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3480         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3481         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3482         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3483         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3484         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3485         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3486         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3487         for (i = 0;i < numvertices;i++)
3488         {
3489                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3490                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3491                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3492                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3493                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3494                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3495                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3496                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3497                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3498                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3499                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3500                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3501                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3502                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3503                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3504                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3505                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3506                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3507                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3508                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3509                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3510                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3511                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3512                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3513                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3514                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3515                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3516                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3517                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3518         }
3519         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3520 }
3521
3522 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3523 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3524 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3525 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3526 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3527 #define DPSOFTRAST_Vector3Normalize(v)\
3528 do\
3529 {\
3530         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3531         if (len)\
3532         {\
3533                 len = 1.0f / len;\
3534                 v[0] *= len;\
3535                 v[1] *= len;\
3536                 v[2] *= len;\
3537         }\
3538 }\
3539 while(0)
3540
3541 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3542 {
3543         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3544         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3545         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3546         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3551         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3552         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3553         int x, startx = span->startx, endx = span->endx;
3554         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3555         float LightVectordata[4];
3556         float LightVectorslope[4];
3557         float EyeVectordata[4];
3558         float EyeVectorslope[4];
3559         float VectorSdata[4];
3560         float VectorSslope[4];
3561         float VectorTdata[4];
3562         float VectorTslope[4];
3563         float VectorRdata[4];
3564         float VectorRslope[4];
3565         float z;
3566         float diffusetex[4];
3567         float glosstex[4];
3568         float surfacenormal[4];
3569         float lightnormal[4];
3570         float lightnormal_modelspace[4];
3571         float eyenormal[4];
3572         float specularnormal[4];
3573         float diffuse;
3574         float specular;
3575         float SpecularPower;
3576         int d[4];
3577         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3578         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3579         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3580         Color_Glow[3] = 0.0f;
3581         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3582         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3583         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3584         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3585         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3586         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3587         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3588         Color_Pants[3] = 0.0f;
3589         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3590         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3591         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3592         Color_Shirt[3] = 0.0f;
3593         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3594         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3595         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3596         {
3597                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3598                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3599         }
3600         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3601         {
3602                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3603         }
3604         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3605         {
3606                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3607                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3608                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3609                 Color_Diffuse[3] = 0.0f;
3610                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3611                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3612                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3613                 LightColor[3] = 0.0f;
3614                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3615                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3616                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3617                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3618                 Color_Specular[3] = 0.0f;
3619                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3620                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3621                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3622
3623                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3624                 {
3625                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3627                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3628                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3629                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3630                 }
3631                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3632                 {
3633                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3634                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3635                 }
3636                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3637                 {
3638                         // nothing of this needed
3639                 }
3640                 else
3641                 {
3642                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3643                 }
3644
3645                 for (x = startx;x < endx;x++)
3646                 {
3647                         z = buffer_z[x];
3648                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3649                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3650                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3651                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3652                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3653                         {
3654                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3655                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3656                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3657                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3658                         }
3659                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3660                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3661                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3662                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3663                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3664                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3665                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3666                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3667
3668                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3669                         {
3670                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3671                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3672                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3673                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3674
3675                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3676                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3677                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3678                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3679
3680                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3681                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3682                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3683                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3684
3685                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3686                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3687                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3688                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3689
3690                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3691                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3692
3693                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3694                                 {
3695                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3696                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3697                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3698                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3699                                 }
3700                         }
3701                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3702                         {
3703                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3706                                 {
3707                                         float f = 1.0f / 256.0f;
3708                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3709                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3710                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3711                                 }
3712                         }
3713                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3714                         {
3715                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3716                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3717                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3718                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3719
3720                                 LightColor[0] = 1.0;
3721                                 LightColor[1] = 1.0;
3722                                 LightColor[2] = 1.0;
3723                         }
3724                         else
3725                         {
3726                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3727                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3728                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3729                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3730                         }
3731
3732                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3733
3734                         if(thread->shader_exactspecularmath)
3735                         {
3736                                 // reflect lightnormal at surfacenormal, take the negative of that
3737                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3738                                 float f;
3739                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3740                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3741                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3742                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3743
3744                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3745                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3746                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3747                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3748                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3749
3750                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3751                         }
3752                         else
3753                         {
3754                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3755                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3756                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3757                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3758
3759                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3760                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3761                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3762                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3763
3764                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3765                         }
3766
3767                         specular = pow(specular, SpecularPower * glosstex[3]);
3768                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3769                         {
3770                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3771                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3772                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3773                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3774                         }
3775                         else
3776                         {
3777                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3778                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3779                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3780                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3781                         }
3782
3783                         buffer_FragColorbgra8[x*4+0] = d[0];
3784                         buffer_FragColorbgra8[x*4+1] = d[1];
3785                         buffer_FragColorbgra8[x*4+2] = d[2];
3786                         buffer_FragColorbgra8[x*4+3] = d[3];
3787                 }
3788         }
3789         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3790         {
3791                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3792                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3793                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3794                 Color_Diffuse[3] = 0.0f;
3795                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3796                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3797                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3798                 LightColor[3] = 0.0f;
3799                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3800
3801                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3802                 {
3803                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3804                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3805                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3806                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3807                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3808                 }
3809                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3810                 {
3811                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3812                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3813                 }
3814                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3815                 {
3816                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3817                 }
3818                 else
3819                 {
3820                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3821                 }
3822
3823                 for (x = startx;x < endx;x++)
3824                 {
3825                         z = buffer_z[x];
3826                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3827                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3828                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3829                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3830                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3831                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3832                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3833                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3834
3835                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3836                         {
3837                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3838                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3839                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3840                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3841
3842                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3843                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3844                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3845                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3846
3847                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3848                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3849                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3850                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3851
3852                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3853                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3854                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3855                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3856
3857                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3858                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3859
3860                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3861                                 {
3862                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3863                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3864                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3865                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3866                                 }
3867                         }
3868                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3869                         {
3870                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3873                                 {
3874                                         float f = 1.0f / 256.0f;
3875                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3876                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3877                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3878                                 }
3879                         }
3880                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3881                         {
3882                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3883                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3884                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3885                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3886
3887                                 LightColor[0] = 1.0;
3888                                 LightColor[1] = 1.0;
3889                                 LightColor[2] = 1.0;
3890                         }
3891                         else
3892                         {
3893                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3894                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3895                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3896                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3897                         }
3898
3899                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3900                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3901                         {
3902                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3903                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3904                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3905                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3906                         }
3907                         else
3908                         {
3909                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3910                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3911                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3912                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3913                         }
3914                         buffer_FragColorbgra8[x*4+0] = d[0];
3915                         buffer_FragColorbgra8[x*4+1] = d[1];
3916                         buffer_FragColorbgra8[x*4+2] = d[2];
3917                         buffer_FragColorbgra8[x*4+3] = d[3];
3918                 }
3919         }
3920         else
3921         {
3922                 for (x = startx;x < endx;x++)
3923                 {
3924                         z = buffer_z[x];
3925                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3926                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3927                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3928                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3929
3930                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3931                         {
3932                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3933                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3934                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3935                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3936                         }
3937                         else
3938                         {
3939                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3940                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3941                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3942                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3943                         }
3944                         buffer_FragColorbgra8[x*4+0] = d[0];
3945                         buffer_FragColorbgra8[x*4+1] = d[1];
3946                         buffer_FragColorbgra8[x*4+2] = d[2];
3947                         buffer_FragColorbgra8[x*4+3] = d[3];
3948                 }
3949         }
3950         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3951 }
3952
3953
3954
3955 void DPSOFTRAST_VertexShader_LightSource(void)
3956 {
3957         int i;
3958         int numvertices = dpsoftrast.numvertices;
3959         float LightPosition[4];
3960         float LightVector[4];
3961         float LightVectorModelSpace[4];
3962         float EyePosition[4];
3963         float EyeVectorModelSpace[4];
3964         float EyeVector[4];
3965         float position[4];
3966         float svector[4];
3967         float tvector[4];
3968         float normal[4];
3969         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3970         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3971         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3972         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3973         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3974         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3975         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3976         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3977         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3978         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3979         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3980         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3981         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3982         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3983         for (i = 0;i < numvertices;i++)
3984         {
3985                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3986                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3987                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3988                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3989                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3990                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3991                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3992                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3993                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3994                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3995                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3996                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3997                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3998                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3999                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4000                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4001                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4002                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4003                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4004                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4005                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4006                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4007                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4008                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4009                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4010                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4011                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4012                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4013                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4014                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4015                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4016                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4017         }
4018         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4019         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4020 }
4021
4022 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4023 {
4024 #ifdef SSE_POSSIBLE
4025         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4026         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4027         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4028         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4029         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4030         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4031         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4032         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4033         int x, startx = span->startx, endx = span->endx;
4034         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4035         float CubeVectordata[4];
4036         float CubeVectorslope[4];
4037         float LightVectordata[4];
4038         float LightVectorslope[4];
4039         float EyeVectordata[4];
4040         float EyeVectorslope[4];
4041         float z;
4042         float diffusetex[4];
4043         float glosstex[4];
4044         float surfacenormal[4];
4045         float lightnormal[4];
4046         float eyenormal[4];
4047         float specularnormal[4];
4048         float diffuse;
4049         float specular;
4050         float SpecularPower;
4051         float CubeVector[4];
4052         float attenuation;
4053         int d[4];
4054         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4055         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4056         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4057         Color_Glow[3] = 0.0f;
4058         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4059         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4060         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4061         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4062         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4063         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4064         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4065         Color_Diffuse[3] = 0.0f;
4066         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4067         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4068         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4069         Color_Specular[3] = 0.0f;
4070         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4071         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4072         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4073         Color_Pants[3] = 0.0f;
4074         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4075         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4076         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4077         Color_Shirt[3] = 0.0f;
4078         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4079         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4080         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4081         LightColor[3] = 0.0f;
4082         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4083         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4084         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4085         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4086         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4087         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4088         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4089         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4090         {
4091                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4092                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4093         }
4094         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4095                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4096         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4097         {
4098                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4099                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4100                 for (x = startx;x < endx;x++)
4101                 {
4102                         z = buffer_z[x];
4103                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4104                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4105                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4106                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4107                         if (attenuation < 0.01f)
4108                                 continue;
4109                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4110                         {
4111                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4112                                 if (attenuation < 0.01f)
4113                                         continue;
4114                         }
4115
4116                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4117                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4118                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4119                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4120                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4121                         {
4122                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4123                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4124                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4125                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4126                         }
4127                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4128                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4129                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4130                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4131                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4132                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4133                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4134                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4135
4136                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4137                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4138                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4139                         DPSOFTRAST_Vector3Normalize(lightnormal);
4140
4141                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4142
4143                         if(thread->shader_exactspecularmath)
4144                         {
4145                                 // reflect lightnormal at surfacenormal, take the negative of that
4146                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4147                                 float f;
4148                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4149                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4150                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4151                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4152
4153                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4154                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4155                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4156                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4157                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4158
4159                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4160                         }
4161                         else
4162                         {
4163                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4164                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4165                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4166                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4167
4168                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4169                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4170                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4171                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4172
4173                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4174                         }
4175                         specular = pow(specular, SpecularPower * glosstex[3]);
4176
4177                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4178                         {
4179                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4180                                 attenuation *= (1.0f / 255.0f);
4181                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4182                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4183                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4184                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4185                         }
4186                         else
4187                         {
4188                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4189                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4190                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4191                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4192                         }
4193                         buffer_FragColorbgra8[x*4+0] = d[0];
4194                         buffer_FragColorbgra8[x*4+1] = d[1];
4195                         buffer_FragColorbgra8[x*4+2] = d[2];
4196                         buffer_FragColorbgra8[x*4+3] = d[3];
4197                 }
4198         }
4199         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4200         {
4201                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4202                 for (x = startx;x < endx;x++)
4203                 {
4204                         z = buffer_z[x];
4205                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4206                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4207                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4208                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4209                         if (attenuation < 0.01f)
4210                                 continue;
4211                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4212                         {
4213                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4214                                 if (attenuation < 0.01f)
4215                                         continue;
4216                         }
4217
4218                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4219                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4220                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4221                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4222                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4223                         {
4224                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4225                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4226                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4227                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4228                         }
4229                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4230                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4231                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4232                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4233
4234                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4235                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4236                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4237                         DPSOFTRAST_Vector3Normalize(lightnormal);
4238
4239                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4240                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4241                         {
4242                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4243                                 attenuation *= (1.0f / 255.0f);
4244                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4245                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4246                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4247                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4248                         }
4249                         else
4250                         {
4251                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4252                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4253                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4254                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4255                         }
4256                         buffer_FragColorbgra8[x*4+0] = d[0];
4257                         buffer_FragColorbgra8[x*4+1] = d[1];
4258                         buffer_FragColorbgra8[x*4+2] = d[2];
4259                         buffer_FragColorbgra8[x*4+3] = d[3];
4260                 }
4261         }
4262         else
4263         {
4264                 for (x = startx;x < endx;x++)
4265                 {
4266                         z = buffer_z[x];
4267                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4268                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4269                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4270                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4271                         if (attenuation < 0.01f)
4272                                 continue;
4273                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4274                         {
4275                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4276                                 if (attenuation < 0.01f)
4277                                         continue;
4278                         }
4279
4280                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4281                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4282                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4283                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4284                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4285                         {
4286                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4287                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4288                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4289                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4290                         }
4291                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4292                         {
4293                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4294                                 attenuation *= (1.0f / 255.0f);
4295                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4296                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4297                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4298                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4299                         }
4300                         else
4301                         {
4302                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4303                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4304                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4305                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4306                         }
4307                         buffer_FragColorbgra8[x*4+0] = d[0];
4308                         buffer_FragColorbgra8[x*4+1] = d[1];
4309                         buffer_FragColorbgra8[x*4+2] = d[2];
4310                         buffer_FragColorbgra8[x*4+3] = d[3];
4311                 }
4312         }
4313         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4314 #endif
4315 }
4316
4317
4318
4319 void DPSOFTRAST_VertexShader_Refraction(void)
4320 {
4321         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4322         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4323         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4324 }
4325
4326 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4327 {
4328         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4329
4330         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4331         float z;
4332         int x, startx = span->startx, endx = span->endx;
4333
4334         // texture reads
4335         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4336         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4337
4338         // varyings
4339         float ModelViewProjectionPositiondata[4];
4340         float ModelViewProjectionPositionslope[4];
4341
4342         // uniforms
4343         float ScreenScaleRefractReflect[2];
4344         float ScreenCenterRefractReflect[2];
4345         float DistortScaleRefractReflect[2];
4346         float RefractColor[4];
4347
4348         const unsigned char * RESTRICT pixelbase;
4349         const unsigned char * RESTRICT pixel[4];
4350         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4351         if(!texture) return;
4352         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4353
4354         // read textures
4355         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4356         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4357
4358         // read varyings
4359         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4360
4361         // read uniforms
4362         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4363         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4364         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4365         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4366         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4367         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4368         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4369         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4370         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4371         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4372
4373         // do stuff
4374         for (x = startx;x < endx;x++)
4375         {
4376                 float SafeScreenTexCoord[2];
4377                 float ScreenTexCoord[2];
4378                 float v[3];
4379                 float iw;
4380                 unsigned char c[4];
4381
4382                 z = buffer_z[x];
4383
4384                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4385                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4386                 
4387                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4388                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4389                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4390
4391                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4392                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4393                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4394                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4395                 DPSOFTRAST_Vector3Normalize(v);
4396                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4397                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4398
4399                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4400                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4401                 {
4402                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4403                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4404                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4405                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4406                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4407                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4408                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4409                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4410                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4411                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4412                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4413                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4414                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4415                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4416                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4417                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4418                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4419                 }
4420                 else
4421                 {
4422                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4423                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4424                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4425                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4426                         c[0] = pixel[0][0];
4427                         c[1] = pixel[0][1];
4428                         c[2] = pixel[0][2];
4429                 }
4430
4431                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4432                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4433                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4434                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4435                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4436         }
4437
4438         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4439 }
4440
4441
4442
4443 void DPSOFTRAST_VertexShader_Water(void)
4444 {
4445         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4446 }
4447
4448
4449 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4450 {
4451         // TODO: IMPLEMENT
4452         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4453         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4454         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4455         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4456         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4457 }
4458
4459
4460
4461 void DPSOFTRAST_VertexShader_ShowDepth(void)
4462 {
4463         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4464 }
4465
4466 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4467 {
4468         // TODO: IMPLEMENT
4469         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4470         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4471         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4472         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4473         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4474 }
4475
4476
4477
4478 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4479 {
4480         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4481 }
4482
4483 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4484 {
4485         // TODO: IMPLEMENT
4486         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4487         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4488         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4489         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4490         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4491 }
4492
4493
4494
4495 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4496 {
4497         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4498 }
4499
4500 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4501 {
4502         // TODO: IMPLEMENT
4503         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4504         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4505         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4506         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4507         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4508 }
4509
4510
4511
4512 typedef struct DPSOFTRAST_ShaderModeInfo_s
4513 {
4514         int lodarrayindex;
4515         void (*Vertex)(void);
4516         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4517         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4518         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4519 }
4520 DPSOFTRAST_ShaderModeInfo;
4521
4522 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4523 {
4524         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4525         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4526         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4527         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4528         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4529         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4530         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4531         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4532         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4533         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4534         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4535         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4536         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4537         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4538         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4539         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4540 };
4541
4542 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4543 {
4544         int x;
4545         int startx;
4546         int endx;
4547         unsigned int *depthpixel;
4548         int depth;
4549         int depthslope;
4550         unsigned int d;
4551         unsigned char *pixelmask;
4552         DPSOFTRAST_State_Triangle *triangle;
4553         triangle = &thread->triangles[span->triangle];
4554         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4555         startx = span->startx;
4556         endx = span->endx;
4557         depth = span->depthbase;
4558         depthslope = span->depthslope;
4559         pixelmask = thread->pixelmaskarray;
4560         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4561         {
4562                 switch(thread->fb_depthfunc)
4563                 {
4564                 default:
4565                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4566                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4567                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4568                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4569                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4570                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4571                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4572                 }
4573                 while (startx < endx && !pixelmask[startx])
4574                         startx++;
4575                 while (endx > startx && !pixelmask[endx-1])
4576                         endx--;
4577         }
4578         else
4579         {
4580                 // no depth testing means we're just dealing with color...
4581                 memset(pixelmask + startx, 1, endx - startx);
4582         }
4583         span->pixelmask = pixelmask;
4584         span->startx = startx;
4585         span->endx = endx;
4586 }
4587
4588 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4589 {
4590         int x, d, depth, depthslope, startx, endx;
4591         const unsigned char *pixelmask;
4592         unsigned int *depthpixel;
4593         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4594         {
4595                 depth = span->depthbase;
4596                 depthslope = span->depthslope;
4597                 pixelmask = span->pixelmask;
4598                 startx = span->startx;
4599                 endx = span->endx;
4600                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4601                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4602                         if (pixelmask[x])
4603                                 depthpixel[x] = d;
4604         }
4605 }
4606
4607 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4608 {
4609         int i;
4610         DPSOFTRAST_State_Triangle *triangle;
4611         DPSOFTRAST_State_Span *span;
4612         for (i = 0; i < thread->numspans; i++)
4613         {
4614                 span = &thread->spans[i];
4615                 triangle = &thread->triangles[span->triangle];
4616                 DPSOFTRAST_Draw_DepthTest(thread, span);
4617                 if (span->startx >= span->endx)
4618                         continue;
4619                 // run pixel shader if appropriate
4620                 // do this before running depthmask code, to allow the pixelshader
4621                 // to clear pixelmask values for alpha testing
4622                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4623                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4624                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4625         }
4626         thread->numspans = 0;
4627 }
4628
4629 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4630
4631 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4632 {
4633 #ifdef SSE_POSSIBLE
4634         int cullface = thread->cullface;
4635         int minx, maxx, miny, maxy;
4636         int miny1, maxy1, miny2, maxy2;
4637         __m128i fbmin, fbmax;
4638         __m128 viewportcenter, viewportscale;
4639         int firstvertex = command->firstvertex;
4640         int numvertices = command->numvertices;
4641         int numtriangles = command->numtriangles;
4642         const int *element3i = command->element3i;
4643         const unsigned short *element3s = command->element3s;
4644         int clipped = command->clipped;
4645         int i;
4646         int j;
4647         int k;
4648         int y;
4649         int e[3];
4650         __m128i screeny;
4651         int starty, endy, bandy;
4652         int numpoints;
4653         int clipcase;
4654         float clipdist[4];
4655         float clip0origin, clip0slope;
4656         int clip0dir;
4657         __m128 triangleedge1, triangleedge2, trianglenormal;
4658         __m128 clipfrac[3];
4659         __m128 screen[4];
4660         DPSOFTRAST_State_Triangle *triangle;
4661         DPSOFTRAST_Texture *texture;
4662         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4663         miny = thread->fb_scissor[1];
4664         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4665         miny1 = bound(miny, thread->miny1, maxy);
4666         maxy1 = bound(miny, thread->maxy1, maxy);
4667         miny2 = bound(miny, thread->miny2, maxy);
4668         maxy2 = bound(miny, thread->maxy2, maxy);
4669         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4670         {
4671                 if (!ATOMIC_DECREMENT(command->refcount))
4672                 {
4673                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4674                                 MM_FREE(command->arrays);
4675                 }
4676                 return;
4677         }
4678         minx = thread->fb_scissor[0];
4679         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4680         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4681         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4682         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4683         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4684         screen[3] = _mm_setzero_ps();
4685         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4686         for (i = 0;i < numtriangles;i++)
4687         {
4688                 const float *screencoord4f = command->arrays;
4689                 const float *arrays = screencoord4f + numvertices*4;
4690
4691                 // generate the 3 edges of this triangle
4692                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4693                 if (element3s)
4694                 {
4695                         e[0] = element3s[i*3+0] - firstvertex;
4696                         e[1] = element3s[i*3+1] - firstvertex;
4697                         e[2] = element3s[i*3+2] - firstvertex;
4698                 }
4699                 else if (element3i)
4700                 {
4701                         e[0] = element3i[i*3+0] - firstvertex;
4702                         e[1] = element3i[i*3+1] - firstvertex;
4703                         e[2] = element3i[i*3+2] - firstvertex;
4704                 }
4705                 else
4706                 {
4707                         e[0] = i*3+0;
4708                         e[1] = i*3+1;
4709                         e[2] = i*3+2;
4710                 }
4711
4712 #define SKIPBACKFACE \
4713                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4714                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4715                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4716                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4717                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4718                 switch(cullface) \
4719                 { \
4720                 case GL_BACK: \
4721                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4722                                 continue; \
4723                         break; \
4724                 case GL_FRONT: \
4725                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4726                                 continue; \
4727                         break; \
4728                 }
4729
4730 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4731                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4732                         { \
4733                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4734                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4735                         }
4736 #define CLIPPEDVERTEXCOPY(k,p1) \
4737                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4738
4739 #define GENATTRIBCOPY(attrib, p1) \
4740                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4741 #define GENATTRIBLERP(attrib, p1, p2) \
4742                 { \
4743                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4744                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4745                 }
4746 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4747                 switch(clipcase) \
4748                 { \
4749                 default: \
4750                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4751                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4752                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4753                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4754                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4755                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4756                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4757                 }
4758
4759                 if (! clipped)
4760                         goto notclipped;
4761
4762                 // calculate distance from nearplane
4763                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4764                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4765                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4766                 if (clipdist[0] >= 0.0f)
4767                 {
4768                         if (clipdist[1] >= 0.0f)
4769                         {
4770                                 if (clipdist[2] >= 0.0f)
4771                                 {
4772                                 notclipped:
4773                                         // triangle is entirely in front of nearplane
4774                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4775                                         SKIPBACKFACE;
4776                                         numpoints = 3;
4777                                         clipcase = 0;
4778                                 }
4779                                 else
4780                                 {
4781                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4782                                         SKIPBACKFACE;
4783                                         numpoints = 4;
4784                                         clipcase = 1;
4785                                 }
4786                         }
4787                         else
4788                         {
4789                                 if (clipdist[2] >= 0.0f)
4790                                 {
4791                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4792                                         SKIPBACKFACE;
4793                                         numpoints = 4;
4794                                         clipcase = 2;
4795                                 }
4796                                 else
4797                                 {
4798                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4799                                         SKIPBACKFACE;
4800                                         numpoints = 3;
4801                                         clipcase = 3;
4802                                 }
4803                         }
4804                 }
4805                 else if (clipdist[1] >= 0.0f)
4806                 {
4807                         if (clipdist[2] >= 0.0f)
4808                         {
4809                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4810                                 SKIPBACKFACE;
4811                                 numpoints = 4;
4812                                 clipcase = 4;
4813                         }
4814                         else
4815                         {
4816                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4817                                 SKIPBACKFACE;
4818                                 numpoints = 3;
4819                                 clipcase = 5;
4820                         }
4821                 }
4822                 else if (clipdist[2] >= 0.0f)
4823                 {
4824                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4825                         SKIPBACKFACE;
4826                         numpoints = 3;
4827                         clipcase = 6;
4828                 }
4829                 else continue; // triangle is entirely behind nearplane
4830
4831                 {
4832                         // calculate integer y coords for triangle points
4833                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4834                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4835                                         screenmin = _mm_min_epi16(screeni, screenir),
4836                                         screenmax = _mm_max_epi16(screeni, screenir);
4837                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4838                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4839                         screenmin = _mm_max_epi16(screenmin, fbmin);
4840                         screenmax = _mm_min_epi16(screenmax, fbmax);
4841                         // skip offscreen triangles
4842                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4843                                 continue;
4844                         starty = _mm_extract_epi16(screenmin, 1);
4845                         endy = _mm_extract_epi16(screenmax, 1)+1;
4846                         if (starty >= maxy1 && endy <= miny2)
4847                                 continue;
4848                         screeny = _mm_srai_epi32(screeni, 16);
4849                 }
4850
4851                 triangle = &thread->triangles[thread->numtriangles];
4852
4853                 // calculate attribute plans for triangle data...
4854                 // okay, this triangle is going to produce spans, we'd better project
4855                 // the interpolants now (this is what gives perspective texturing),
4856                 // this consists of simply multiplying all arrays by the W coord
4857                 // (which is basically 1/Z), which will be undone per-pixel
4858                 // (multiplying by Z again) to get the perspective-correct array
4859                 // values
4860                 {
4861                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4862                         __m128 mipedgescale, mipdensity;
4863                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4864                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4865                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4866                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4867                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4868                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4869                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4870                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4871                         attribedge1 = _mm_sub_ss(w0, w1);
4872                         attribedge2 = _mm_sub_ss(w2, w1);
4873                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4874                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4875                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4876                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4877                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4878                         _mm_store_ss(&triangle->w[0], attribxslope);
4879                         _mm_store_ss(&triangle->w[1], attribyslope);
4880                         _mm_store_ss(&triangle->w[2], attriborigin);
4881                         
4882                         clip0origin = 0;
4883                         clip0slope = 0;
4884                         clip0dir = 0;
4885                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4886                         {
4887                                 float cliporigin, clipxslope, clipyslope;
4888                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4889                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4890                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4891                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4892                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4893                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4894                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4895                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4896                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4897                                 if(clipxslope != 0)
4898                                 {
4899                                         clip0origin = -cliporigin/clipxslope;
4900                                         clip0slope = -clipyslope/clipxslope;
4901                                         clip0dir = clipxslope > 0 ? 1 : -1;
4902                                 }
4903                                 else if(clipyslope > 0)
4904                                 {
4905                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4906                                         clip0slope = dpsoftrast.fb_width;
4907                                         clip0dir = -1;
4908                                 }
4909                                 else if(clipyslope < 0)
4910                                 {
4911                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4912                                         clip0slope = -dpsoftrast.fb_width;
4913                                         clip0dir = -1;
4914                                 }
4915                                 else if(clip0origin < 0) continue;
4916                         }
4917
4918                         mipedgescale = _mm_setzero_ps();
4919                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4920                         {
4921                                 __m128 attrib0, attrib1, attrib2;
4922                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4923                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4924                                         break;
4925                                 arrays += numvertices*4;
4926                                 GENATTRIBS(attrib0, attrib1, attrib2);
4927                                 attriborigin = _mm_mul_ps(attrib1, w1);
4928                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4929                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4930                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4931                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4932                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4933                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4934                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4935                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4936                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4937                                 {
4938                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4939                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4940                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4941                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4942                                 }
4943                         }
4944
4945                         memset(triangle->mip, 0, sizeof(triangle->mip));
4946                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4947                         {
4948                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4949                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4950                                         break;
4951                                 texture = thread->texbound[texunit];
4952                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4953                                 {
4954                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4955                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4956                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4957                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4958                                         // this will be multiplied in the texturing routine by the texture resolution
4959                                         y = _mm_cvtss_si32(mipdensity);
4960                                         if (y > 0)
4961                                         {
4962                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4963                                                 if (y > texture->mipmaps - 1)
4964                                                         y = texture->mipmaps - 1;
4965                                                 triangle->mip[texunit] = y;
4966                                         }
4967                                 }
4968                         }
4969                 }
4970         
4971                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4972                 for (; y < bandy;)
4973                 {
4974                         __m128 xcoords, xslope;
4975                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4976                         int yccmask = _mm_movemask_epi8(ycc);
4977                         int edge0p, edge0n, edge1p, edge1n;
4978                         int nexty;
4979                         float w, wslope;
4980                         float clip0;
4981                         if (numpoints == 4)
4982                         {
4983                                 switch(yccmask)
4984                                 {
4985                                 default:
4986                                 case 0xFFFF: /*0000*/ y = endy; continue;
4987                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4988                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4989                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4990                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4991                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4992                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4993                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4994                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4995                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4996                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4997                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4998                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4999                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5000                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5001                                 case 0x0000: /*1111*/ y++; continue;
5002                                 }
5003                         }
5004                         else
5005                         {
5006                                 switch(yccmask)
5007                                 {
5008                                 default:
5009                                 case 0xFFFF: /*000*/ y = endy; continue;
5010                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5011                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5012                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5013                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5014                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5015                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5016                                 case 0x0000: /*111*/ y++; continue;
5017                                 }
5018                         }
5019                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5020                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5021                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5022                         nexty = _mm_extract_epi16(ycc, 0);
5023                         if (nexty >= bandy) nexty = bandy-1;
5024                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5025                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5026                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5027                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5028                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5029                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5030                         {
5031                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5032                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5033                         }
5034                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5035                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5036                         {
5037                                 int startx, endx, clipx = minx, offset;
5038                                 startx = _mm_cvtss_si32(xcoords);
5039                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5040                                 if (startx < minx) 
5041                                 {
5042                                         if (startx < 0) startx = 0;
5043                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5044                                 }
5045                                 if (endx > maxx) endx = maxx;
5046                                 if (startx >= endx) continue;
5047
5048                                 if (clip0dir)
5049                                 {
5050                                         if (clip0dir > 0)
5051                                         {
5052                                                 if (startx < clip0) 
5053                                                 {
5054                                                         if(endx <= clip0) continue;
5055                                                         clipx = max((int)clip0, minx);
5056                                                         startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1); 
5057                                                 }
5058                                         }
5059                                         else if (endx > clip0) 
5060                                         {
5061                                                 if(startx >= clip0) continue;
5062                                                 endx = (int)clip0;
5063                                         }
5064                                 }
5065                                                 
5066                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5067                                 {
5068                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5069                                         span->triangle = thread->numtriangles;
5070                                         span->x = offset;
5071                                         span->y = y;
5072                                         span->startx = max(clipx - offset, 0);
5073                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5074                                         if (span->startx >= span->endx)
5075                                                 continue;
5076                                         wslope = triangle->w[0];
5077                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5078                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5079                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5080                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5081                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5082                                 }
5083                         }
5084                 }
5085
5086                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5087                 {
5088                         DPSOFTRAST_Draw_ProcessSpans(thread);
5089                         thread->numtriangles = 0;
5090                 }
5091         }
5092
5093         if (!ATOMIC_DECREMENT(command->refcount))
5094         {
5095                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5096                         MM_FREE(command->arrays);
5097         }
5098
5099         if (thread->numspans > 0 || thread->numtriangles > 0)
5100         {
5101                 DPSOFTRAST_Draw_ProcessSpans(thread);
5102                 thread->numtriangles = 0;
5103         }
5104 #endif
5105 }
5106
5107 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5108 {
5109         int i;
5110         int j;
5111         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5112         int datasize = 2*numvertices*sizeof(float[4]);
5113         DPSOFTRAST_Command_Draw *command;
5114         unsigned char *data;
5115         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5116         {
5117                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5118                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5119                         break;
5120                 datasize += numvertices*sizeof(float[4]);
5121         }
5122         if (element3s)
5123                 datasize += numtriangles*sizeof(unsigned short[3]);
5124         else if (element3i)
5125                 datasize += numtriangles*sizeof(int[3]);
5126         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5127         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5128         {
5129                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5130                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5131         }
5132         else
5133         {
5134                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5135                 data = (unsigned char *)command + commandsize;
5136         }
5137         command->firstvertex = firstvertex;
5138         command->numvertices = numvertices;
5139         command->numtriangles = numtriangles;
5140         command->arrays = (float *)data;
5141         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5142         dpsoftrast.firstvertex = firstvertex;
5143         dpsoftrast.numvertices = numvertices;
5144         dpsoftrast.screencoord4f = (float *)data;
5145         data += numvertices*sizeof(float[4]);
5146         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5147         data += numvertices*sizeof(float[4]);
5148         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5149         {
5150                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5151                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5152                         break;
5153                 dpsoftrast.post_array4f[j] = (float *)data;
5154                 data += numvertices*sizeof(float[4]);
5155         }
5156         command->element3i = NULL;
5157         command->element3s = NULL;
5158         if (element3s)
5159         {
5160                 command->element3s = (unsigned short *)data;
5161                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5162         }
5163         else if (element3i)
5164         {
5165                 command->element3i = (int *)data;
5166                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5167         }
5168         return command;
5169 }
5170
5171 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5172 {
5173         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5174         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5175         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5176         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5177         if (command->starty >= command->endy)
5178         {
5179                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5180                         MM_FREE(command->arrays);
5181                 DPSOFTRAST_UndoCommand(command->commandsize);
5182                 return;
5183         }
5184         command->clipped = dpsoftrast.drawclipped;
5185         command->refcount = dpsoftrast.numthreads;
5186
5187         if (dpsoftrast.usethreads)
5188         {
5189                 int i;
5190                 DPSOFTRAST_Draw_SyncCommands();
5191                 for (i = 0; i < dpsoftrast.numthreads; i++)
5192                 {
5193                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5194                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5195                                 Thread_CondSignal(thread->drawcond);
5196                 }
5197         }
5198         else
5199         {
5200                 DPSOFTRAST_Draw_FlushThreads();
5201         }
5202 }
5203
5204 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5205 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5206 {
5207         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5208 }
5209 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5210 {
5211         DPSOFTRAST_Command_SetRenderTargets *command;
5212         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5213                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5214                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5215                 DPSOFTRAST_Flush();
5216         dpsoftrast.fb_width = width;
5217         dpsoftrast.fb_height = height;
5218         dpsoftrast.fb_depthpixels = depthpixels;
5219         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5220         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5221         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5222         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5223         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5224         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5225         command->width = width;
5226         command->height = height;
5227 }
5228  
5229 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5230 {
5231         int commandoffset = thread->commandoffset;
5232         while (commandoffset != endoffset)
5233         {
5234                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5235                 switch (command->opcode)
5236                 {
5237 #define INTERPCOMMAND(name) \
5238                 case DPSOFTRAST_OPCODE_##name : \
5239                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5240                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5241                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5242                                 commandoffset = 0; \
5243                         break;
5244                 INTERPCOMMAND(Viewport)
5245                 INTERPCOMMAND(ClearColor)
5246                 INTERPCOMMAND(ClearDepth)
5247                 INTERPCOMMAND(ColorMask)
5248                 INTERPCOMMAND(DepthTest)
5249                 INTERPCOMMAND(ScissorTest)
5250                 INTERPCOMMAND(Scissor)
5251                 INTERPCOMMAND(BlendFunc)
5252                 INTERPCOMMAND(BlendSubtract)
5253                 INTERPCOMMAND(DepthMask)
5254                 INTERPCOMMAND(DepthFunc)
5255                 INTERPCOMMAND(DepthRange)
5256                 INTERPCOMMAND(PolygonOffset)
5257                 INTERPCOMMAND(CullFace)
5258                 INTERPCOMMAND(AlphaTest)
5259                 INTERPCOMMAND(AlphaFunc)
5260                 INTERPCOMMAND(SetTexture)
5261                 INTERPCOMMAND(SetShader)
5262                 INTERPCOMMAND(Uniform4f)
5263                 INTERPCOMMAND(UniformMatrix4f)
5264                 INTERPCOMMAND(Uniform1i)
5265                 INTERPCOMMAND(SetRenderTargets)
5266                 INTERPCOMMAND(ClipPlane)
5267
5268                 case DPSOFTRAST_OPCODE_Draw:
5269                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5270                         commandoffset += command->commandsize;
5271                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5272                                 commandoffset = 0;
5273                         thread->commandoffset = commandoffset;
5274                         break;
5275
5276                 case DPSOFTRAST_OPCODE_Reset:
5277                         commandoffset = 0;
5278                         break;
5279                 }
5280         }
5281         thread->commandoffset = commandoffset;
5282 }
5283
5284 static int DPSOFTRAST_Draw_Thread(void *data)
5285 {
5286         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5287         while(thread->index >= 0)
5288         {
5289                 if (thread->commandoffset != dpsoftrast.drawcommand)
5290                 {
5291                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5292                 }
5293                 else 
5294                 {
5295                         Thread_LockMutex(thread->drawmutex);
5296                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5297                         {
5298                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5299                                 thread->starving = true;
5300                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5301                                 thread->starving = false;
5302                         }
5303                         Thread_UnlockMutex(thread->drawmutex);
5304                 }
5305         }   
5306         return 0;
5307 }
5308
5309 static void DPSOFTRAST_Draw_FlushThreads(void)
5310 {
5311         DPSOFTRAST_State_Thread *thread;
5312         int i;
5313         DPSOFTRAST_Draw_SyncCommands();
5314         if (dpsoftrast.usethreads) 
5315         {
5316                 for (i = 0; i < dpsoftrast.numthreads; i++)
5317                 {
5318                         thread = &dpsoftrast.threads[i];
5319                         if (thread->commandoffset != dpsoftrast.drawcommand)
5320                         {
5321                                 Thread_LockMutex(thread->drawmutex);
5322                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5323                                         Thread_CondSignal(thread->drawcond);
5324                                 Thread_UnlockMutex(thread->drawmutex);
5325                         }
5326                 }
5327                 for (i = 0; i < dpsoftrast.numthreads; i++)
5328                 {
5329                         thread = &dpsoftrast.threads[i];
5330                         if (thread->commandoffset != dpsoftrast.drawcommand)
5331                         {
5332                                 Thread_LockMutex(thread->drawmutex);
5333                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5334                                 {
5335                                         thread->waiting = true;
5336                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5337                                         thread->waiting = false;
5338                                 }
5339                                 Thread_UnlockMutex(thread->drawmutex);
5340                         }
5341                 }
5342         }
5343         else
5344         {
5345                 for (i = 0; i < dpsoftrast.numthreads; i++)
5346                 {
5347                         thread = &dpsoftrast.threads[i];
5348                         if (thread->commandoffset != dpsoftrast.drawcommand)
5349                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5350                 }
5351         }
5352         dpsoftrast.commandpool.usedcommands = 0;
5353 }
5354
// Public flush entry point: executes all commands queued so far by
// delegating to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5359
// Public finish entry point: currently identical to DPSOFTRAST_Flush, which
// it calls directly.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5364
5365 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5366 {
5367         int i;
5368         union
5369         {
5370                 int i;
5371                 unsigned char b[4];
5372         }
5373         u;
5374         u.i = 1;
5375         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5376         dpsoftrast.bigendian = u.b[3];
5377         dpsoftrast.fb_width = width;
5378         dpsoftrast.fb_height = height;
5379         dpsoftrast.fb_depthpixels = depthpixels;
5380         dpsoftrast.fb_colorpixels[0] = colorpixels;
5381         dpsoftrast.fb_colorpixels[1] = NULL;
5382         dpsoftrast.fb_colorpixels[1] = NULL;
5383         dpsoftrast.fb_colorpixels[1] = NULL;
5384         dpsoftrast.viewport[0] = 0;
5385         dpsoftrast.viewport[1] = 0;
5386         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5387         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5388         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5389         dpsoftrast.texture_firstfree = 1;
5390         dpsoftrast.texture_end = 1;
5391         dpsoftrast.texture_max = 0;
5392         dpsoftrast.color[0] = 1;
5393         dpsoftrast.color[1] = 1;
5394         dpsoftrast.color[2] = 1;
5395         dpsoftrast.color[3] = 1;
5396         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5397         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5398         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5399         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5400         for (i = 0; i < dpsoftrast.numthreads; i++)
5401         {
5402                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5403                 thread->index = i;
5404                 thread->cullface = GL_BACK;
5405                 thread->colormask[1] = 1;
5406                 thread->colormask[2] = 1;
5407                 thread->colormask[3] = 1;
5408                 thread->blendfunc[0] = GL_ONE;
5409                 thread->blendfunc[1] = GL_ZERO;
5410                 thread->depthmask = true;
5411                 thread->depthtest = true;
5412                 thread->depthfunc = GL_LEQUAL;
5413                 thread->scissortest = false;
5414                 thread->alphatest = false;
5415                 thread->alphafunc = GL_GREATER;
5416                 thread->alphavalue = 0.5f;
5417                 thread->viewport[0] = 0;
5418                 thread->viewport[1] = 0;
5419                 thread->viewport[2] = dpsoftrast.fb_width;
5420                 thread->viewport[3] = dpsoftrast.fb_height;
5421                 thread->scissor[0] = 0;
5422                 thread->scissor[1] = 0;
5423                 thread->scissor[2] = dpsoftrast.fb_width;
5424                 thread->scissor[3] = dpsoftrast.fb_height;
5425                 thread->depthrange[0] = 0;
5426                 thread->depthrange[1] = 1;
5427                 thread->polygonoffset[0] = 0;
5428                 thread->polygonoffset[1] = 0;
5429                 thread->clipplane[0] = 0;
5430                 thread->clipplane[1] = 0;
5431                 thread->clipplane[2] = 0;
5432                 thread->clipplane[3] = 1;
5433         
5434                 thread->numspans = 0;
5435                 thread->numtriangles = 0;
5436                 thread->commandoffset = 0;
5437                 thread->waiting = false;
5438                 thread->starving = false;
5439            
5440                 thread->validate = -1;
5441                 DPSOFTRAST_Validate(thread, -1);
5442  
5443                 if (dpsoftrast.usethreads)
5444                 {
5445                         thread->waitcond = Thread_CreateCond();
5446                         thread->drawcond = Thread_CreateCond();
5447                         thread->drawmutex = Thread_CreateMutex();
5448                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5449                 }
5450         }
5451         return 0;
5452 }
5453
5454 void DPSOFTRAST_Shutdown(void)
5455 {
5456         int i;
5457         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5458         {
5459                 DPSOFTRAST_State_Thread *thread;
5460                 for (i = 0; i < dpsoftrast.numthreads; i++)
5461                 {
5462                         thread = &dpsoftrast.threads[i];
5463                         Thread_LockMutex(thread->drawmutex);
5464                         thread->index = -1;
5465                         Thread_CondSignal(thread->drawcond);
5466                         Thread_UnlockMutex(thread->drawmutex);
5467                         Thread_WaitThread(thread->thread, 0);
5468                         Thread_DestroyCond(thread->waitcond);
5469                         Thread_DestroyCond(thread->drawcond);
5470                         Thread_DestroyMutex(thread->drawmutex);
5471                 }
5472         }
5473         for (i = 0;i < dpsoftrast.texture_end;i++)
5474                 if (dpsoftrast.texture[i].bytes)
5475                         MM_FREE(dpsoftrast.texture[i].bytes);
5476         if (dpsoftrast.texture)
5477                 free(dpsoftrast.texture);
5478         if (dpsoftrast.threads)
5479                 MM_FREE(dpsoftrast.threads);
5480         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5481 }
5482