]> git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
Turn off RFC 1149 on systems that have it enabled.
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// Byte alignments requested by ALIGN() and ATOMIC() respectively.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Toolchain-specific helpers, only selected when SSE_POSSIBLE is defined:
//   ALIGN(var)             declare var with 16 byte alignment
//   ATOMIC(var)            declare var with 4 byte alignment
//   MEMORY_BARRIER         store fence
//   ATOMIC_COUNTER         type of an atomically updated counter
//   ATOMIC_INCREMENT/DECREMENT/ADD  atomic updates of such a counter
#if defined(SSE_POSSIBLE)
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		// alternative barrier left by the original author: (__sync_synchronize())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		// The explicit LONG * casts work around broken mingw packages on
		// Ubuntu whose headers declare these functions as taking a plain
		// LONG *, which would otherwise be a compile error here. Toolchains
		// that declare InterlockedIncrement correctly (e.g. mingw on
		// Windows) accept the cast without errors or warnings.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		// alternative barrier left by the original author: (__sync_synchronize())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		// alternative barrier left by the original author: (MemoryBarrier())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// Fallbacks for anything the section above left undefined: no alignment
// request, no barrier, and ordinary (non-atomic) integer arithmetic.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Workaround meant for GCC releases older than 4.6 only: provide
// _mm_cvtss_f32 through the vector-extract builtin (presumably because those
// compilers' headers lack it - confirm). The version test must use __GNUC__;
// a misspelled macro evaluates to 0 in #if and would enable this on every GCC.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

// Allocates size bytes aligned to ALIGN_SIZE; release with MM_FREE.
#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// Aligned counterpart of calloc: allocates nmemb*size zero-filled bytes
// aligned to ALIGN_SIZE. Returns NULL if the allocation fails or if
// nmemb*size does not fit in size_t. Release with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb*size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL) memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// Without SSE there is no alignment requirement; use the C allocator directly.
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-array tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR,
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL // count of the entries above, not an array itself
} DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// Commands share the alignment used for SIMD data.
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header every command starts with.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;       // one of the DPSOFTRAST_OPCODE_ values
	unsigned short commandsize;
} DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// Declares a command type: defines DPSOFTRAST_OPCODE_<name> as opcodeval and
// an aligned struct DPSOFTRAST_Command_<name> consisting of the common
// header followed by the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// Evaluates attribute array "arrayindex" of a triangle at a span's (x, y)
// position using SSE:
//   slope = attribs[arrayindex][0]                       (change per x)
//   data  = attribs[arrayindex][2] + x*slope + y*attribs[arrayindex][1]
// triangle and span are pointers; data and slope are __m128 lvalues.
// Every argument is parenthesized so expressions such as &tri are safe.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4].
// span is parenthesized before ->, so a pointer expression such as &span
// expands correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle;             // triangle this span was generated by
	int x;                    // framebuffer x coord
	int y;                    // framebuffer y coord
	int startx;               // usable range (according to pixelmask)
	int endx;                 // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
	int depthbase;            // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthslope;           // depthbuffer value pixel delta
} DPSOFTRAST_State_Span);
213
// Capacities of the per-thread span and triangle arrays, and the pixel
// count the per-thread pixel mask array is sized for.
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bit flags naming the groups of derived per-thread state that can be
// revalidated; DPSOFTRAST_VALIDATE_DRAW is the union of all three.
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer distinguishes; the current one is derived from
// the blend function state. DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL // count of the modes above
} DPSOFTRAST_BLENDMODE;
238
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index;
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2];
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]);
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4];
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace;
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// Scale applied when converting a float depth to the integer depth format
// (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color components (scaled by 255 and rounded by adding
// 0.5) into one integer with a in bits 24-31, r in bits 16-23, g in bits
// 8-15 and b in bits 0-7. Arguments are parenthesized so that expressions
// such as x + y are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format: (1 - d) scaled by
// DPSOFTRAST_DEPTHSCALE and truncated. d is parenthesized so that an
// expression argument is subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         dst -= texture->mipmap[0][2] * 4;
755                         memcpy(dst, pixels, blockwidth * 4);
756                         pixels += blockwidth * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769         {
770                 int i, stride = texture->mipmap[0][2]*4;
771                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
772                 for (i = texture->mipmap[0][3];i > 0;i--)
773                 {
774                         dst -= stride;
775                         memcpy(dst, pixels, stride);
776                         pixels += stride;
777                 }
778         }
779         DPSOFTRAST_Texture_CalculateMipmaps(index);
780 }
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
782 {
783         DPSOFTRAST_Texture *texture;
784         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785         return texture->mipmap[mip][2];
786 }
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791         return texture->mipmap[mip][3];
792 }
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
794 {
795         DPSOFTRAST_Texture *texture;
796         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797         return texture->mipmap[mip][4];
798 }
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
800 {
801         DPSOFTRAST_Texture *texture;
802         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
803         if (texture->binds)
804                 DPSOFTRAST_Flush();
805         return texture->bytes + texture->mipmap[mip][0];
806 }
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
808 {
809         DPSOFTRAST_Texture *texture;
810         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
812         {
813                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
814                 return;
815         }
816         if (texture->binds)
817                 DPSOFTRAST_Flush();
818         texture->filter = filter;
819 }
820
821 static void DPSOFTRAST_Draw_FlushThreads(void);
822
823 static void DPSOFTRAST_Draw_SyncCommands(void)
824 {
825         if(dpsoftrast.usethreads) MEMORY_BARRIER;
826         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
827 }
828
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
830 {
831         DPSOFTRAST_State_Thread *thread;
832         int i;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
836                 return;
837         DPSOFTRAST_Draw_SyncCommands();
838         for(;;)
839         {
840                 int waitindex = -1;
841                 int commandoffset;
842                 usedcommands = 0;
843                 for (i = 0; i < dpsoftrast.numthreads; i++)
844                 {
845                         thread = &dpsoftrast.threads[i]; 
846                         commandoffset = freecommand - thread->commandoffset;
847                         if (commandoffset < 0)
848                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849                         if (commandoffset > usedcommands)
850                         {
851                                 waitindex = i;
852                                 usedcommands = commandoffset;
853                         }
854                 }
855                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
856                         break;
857                 thread = &dpsoftrast.threads[waitindex];
858                 Thread_LockMutex(thread->drawmutex);
859                 if (thread->commandoffset != dpsoftrast.drawcommand)
860                 {
861                         thread->waiting = true;
862                         if (thread->starving) Thread_CondSignal(thread->drawcond);
863                         Thread_CondWait(thread->waitcond, thread->drawmutex);
864                         thread->waiting = false;
865                 }
866                 Thread_UnlockMutex(thread->drawmutex);
867         }
868         dpsoftrast.commandpool.usedcommands = usedcommands;
869 }
870
// number of bytes that must be added to size to reach the next multiple of
// COMMAND_SIZE (0 when size already is one); the masking assumes COMMAND_SIZE
// is a power of two
#define DPSOFTRAST_COMMANDPADDING(size) \
	((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))
// size rounded up to a multiple of COMMAND_SIZE
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + DPSOFTRAST_COMMANDPADDING(size))
// reserves pool space for a DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and yields a typed pointer to it
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
875
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
877 {
878         DPSOFTRAST_Command *command;
879         int freecommand = dpsoftrast.commandpool.freecommand;
880         int usedcommands = dpsoftrast.commandpool.usedcommands;
881         int extra = sizeof(DPSOFTRAST_Command);
882         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
884         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
885         {
886                 if (dpsoftrast.usethreads)
887                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
888                 else
889                         DPSOFTRAST_Draw_FlushThreads();
890                 freecommand = dpsoftrast.commandpool.freecommand;
891                 usedcommands = dpsoftrast.commandpool.usedcommands;
892         }
893         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
894         {
895                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896                 command->opcode = DPSOFTRAST_OPCODE_Reset;
897                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
898                 freecommand = 0;
899         }
900         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901         command->opcode = opcode;
902         command->commandsize = size;
903         freecommand += size;
904         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
905                 freecommand = 0;
906         dpsoftrast.commandpool.freecommand = freecommand;
907         dpsoftrast.commandpool.usedcommands = usedcommands + size;
908         return command;
909 }
910
911 static void DPSOFTRAST_UndoCommand(int size)
912 {
913         int freecommand = dpsoftrast.commandpool.freecommand;
914         int usedcommands = dpsoftrast.commandpool.usedcommands;
915         freecommand -= size;
916         if (freecommand < 0)
917                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918         usedcommands -= size;
919         dpsoftrast.commandpool.freecommand = freecommand;
920         dpsoftrast.commandpool.usedcommands = usedcommands;
921 }
922                 
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
925 {
926         thread->viewport[0] = command->x;
927         thread->viewport[1] = command->y;
928         thread->viewport[2] = command->width;
929         thread->viewport[3] = command->height;
930         thread->validate |= DPSOFTRAST_VALIDATE_FB;
931 }
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
933 {
934         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
935         command->x = x;
936         command->y = y;
937         command->width = width;
938         command->height = height;
939
940         dpsoftrast.viewport[0] = x;
941         dpsoftrast.viewport[1] = y;
942         dpsoftrast.viewport[2] = width;
943         dpsoftrast.viewport[3] = height;
944         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
945 }
946
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
949 {
950         int i, x1, y1, x2, y2, w, h, x, y;
951         int miny1, maxy1, miny2, maxy2;
952         int bandy;
953         unsigned int *p;
954         unsigned int c;
955         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956         miny1 = thread->miny1;
957         maxy1 = thread->maxy1;
958         miny2 = thread->miny2;
959         maxy2 = thread->maxy2;
960         x1 = thread->fb_scissor[0];
961         y1 = thread->fb_scissor[1];
962         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
964         if (y1 < miny1) y1 = miny1;
965         if (y2 > maxy2) y2 = maxy2;
966         w = x2 - x1;
967         h = y2 - y1;
968         if (w < 1 || h < 1)
969                 return;
970         // FIXME: honor fb_colormask?
971         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
972         for (i = 0;i < 4;i++)
973         {
974                 if (!dpsoftrast.fb_colorpixels[i])
975                         continue;
976                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977                 for (;y < bandy;y++)
978                 {
979                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980                         for (x = x1;x < x2;x++)
981                                 p[x] = c;
982                 }
983         }
984 }
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
986 {
987         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(3, ClearDepth, float depth;)
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
996 {
997         int x1, y1, x2, y2, w, h, x, y;
998         int miny1, maxy1, miny2, maxy2;
999         int bandy;
1000         unsigned int *p;
1001         unsigned int c;
1002         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003         miny1 = thread->miny1;
1004         maxy1 = thread->maxy1;
1005         miny2 = thread->miny2;
1006         maxy2 = thread->maxy2;
1007         x1 = thread->fb_scissor[0];
1008         y1 = thread->fb_scissor[1];
1009         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1011         if (y1 < miny1) y1 = miny1;
1012         if (y2 > maxy2) y2 = maxy2;
1013         w = x2 - x1;
1014         h = y2 - y1;
1015         if (w < 1 || h < 1)
1016                 return;
1017         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1018         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019         for (;y < bandy;y++)
1020         {
1021                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022                 for (x = x1;x < x2;x++)
1023                         p[x] = c;
1024         }
1025 }
1026 void DPSOFTRAST_ClearDepth(float d)
1027 {
1028         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1029         command->depth = d;
1030 }
1031
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1034 {
1035         thread->colormask[0] = command->r != 0;
1036         thread->colormask[1] = command->g != 0;
1037         thread->colormask[2] = command->b != 0;
1038         thread->colormask[3] = command->a != 0;
1039         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1040 }
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1042 {
1043         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1044         command->r = r;
1045         command->g = g;
1046         command->b = b;
1047         command->a = a;
1048 }
1049
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1052 {
1053         thread->depthtest = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1055 }
1056 void DPSOFTRAST_DepthTest(int enable)
1057 {
1058         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(6, ScissorTest, int enable;)
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1064 {
1065         thread->scissortest = command->enable;
1066         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1067 }
1068 void DPSOFTRAST_ScissorTest(int enable)
1069 {
1070         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071         command->enable = enable;
1072 }
1073
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1076 {
1077         thread->scissor[0] = command->x;
1078         thread->scissor[1] = command->y;
1079         thread->scissor[2] = command->width;
1080         thread->scissor[3] = command->height;
1081         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1082 }
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1084 {
1085         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1086         command->x = x;
1087         command->y = y;
1088         command->width = width;
1089         command->height = height;
1090 }
1091
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1094 {
1095         thread->blendfunc[0] = command->sfactor;
1096         thread->blendfunc[1] = command->dfactor;
1097         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1098 }
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1100 {
1101         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102         command->sfactor = sfactor;
1103         command->dfactor = dfactor;
1104 }
1105
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1108 {
1109         thread->blendsubtract = command->enable;
1110         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1111 }
1112 void DPSOFTRAST_BlendSubtract(int enable)
1113 {
1114         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115         command->enable = enable;
1116 }
1117
1118 DEFCOMMAND(10, DepthMask, int enable;)
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1120 {
1121         thread->depthmask = command->enable;
1122 }
1123 void DPSOFTRAST_DepthMask(int enable)
1124 {
1125         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126         command->enable = enable;
1127 }
1128
1129 DEFCOMMAND(11, DepthFunc, int func;)
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1131 {
1132         thread->depthfunc = command->func;
1133 }
1134 void DPSOFTRAST_DepthFunc(int func)
1135 {
1136         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137         command->func = func;
1138 }
1139
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1142 {
1143         thread->depthrange[0] = command->nearval;
1144         thread->depthrange[1] = command->farval;
1145 }
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1147 {
1148         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149         command->nearval = nearval;
1150         command->farval = farval;
1151 }
1152
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1155 {
1156         thread->polygonoffset[0] = command->alongnormal;
1157         thread->polygonoffset[1] = command->intoview;
1158 }
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1160 {
1161         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162         command->alongnormal = alongnormal;
1163         command->intoview = intoview;
1164 }
1165
1166 DEFCOMMAND(14, CullFace, int mode;)
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1168 {
1169         thread->cullface = command->mode;
1170 }
1171 void DPSOFTRAST_CullFace(int mode)
1172 {
1173         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174         command->mode = mode;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - sy1 - th;
1282         ty1 = theight - ty1 - th;
1283         for (y = 0;y < th;y++)
1284                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1285         if (texture->mipmaps > 1)
1286                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1287 }
1288
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 {
1292         if (thread->texbound[command->unitnum])
1293                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294         thread->texbound[command->unitnum] = command->texture;
1295 }
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 {
1298         DPSOFTRAST_Command_SetTexture *command;
1299         DPSOFTRAST_Texture *texture;
1300         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301         {
1302                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1303                 return;
1304         }
1305         texture = DPSOFTRAST_Texture_GetByIndex(index);
1306         if (index && !texture)
1307         {
1308                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309                 return;
1310         }
1311
1312         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313         command->unitnum = unitnum;
1314         command->texture = texture;
1315
1316         dpsoftrast.texbound[unitnum] = texture;
1317         if (texture)
1318                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1319 }
1320
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1322 {
1323         dpsoftrast.pointer_vertex3f = vertex3f;
1324         dpsoftrast.stride_vertex = stride;
1325 }
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1327 {
1328         dpsoftrast.pointer_color4f = color4f;
1329         dpsoftrast.pointer_color4ub = NULL;
1330         dpsoftrast.stride_color = stride;
1331 }
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1333 {
1334         dpsoftrast.pointer_color4f = NULL;
1335         dpsoftrast.pointer_color4ub = color4ub;
1336         dpsoftrast.stride_color = stride;
1337 }
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1339 {
1340         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342         dpsoftrast.stride_texcoord[unitnum] = stride;
1343 }
1344
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1347 {
1348         thread->shader_mode = command->mode;
1349         thread->shader_permutation = command->permutation;
1350         thread->shader_exactspecularmath = command->exactspecularmath;
1351 }
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1353 {
1354         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355         command->mode = mode;
1356         command->permutation = permutation;
1357         command->exactspecularmath = exactspecularmath;
1358
1359         dpsoftrast.shader_mode = mode;
1360         dpsoftrast.shader_permutation = permutation;
1361         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1362 }
1363
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1366 {
1367         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1368 }
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1370 {
1371         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372         command->index = index;
1373         command->val[0] = v0;
1374         command->val[1] = v1;
1375         command->val[2] = v2;
1376         command->val[3] = v3;
1377
1378         dpsoftrast.uniform4f[index*4+0] = v0;
1379         dpsoftrast.uniform4f[index*4+1] = v1;
1380         dpsoftrast.uniform4f[index*4+2] = v2;
1381         dpsoftrast.uniform4f[index*4+3] = v3;
1382 }
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1384 {
1385         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386         command->index = index;
1387         memcpy(command->val, v, sizeof(command->val));
1388
1389         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1390 }
1391
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1394 {
1395         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1396 }
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1398 {
1399 #ifdef SSE_POSSIBLE
1400         int i, index;
1401         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1402         {
1403                 __m128 m0, m1, m2, m3;
1404                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405                 command->index = (DPSOFTRAST_UNIFORM)index;
1406                 if (((size_t)v)&(ALIGN_SIZE-1))
1407                 {
1408                         m0 = _mm_loadu_ps(v);
1409                         m1 = _mm_loadu_ps(v+4);
1410                         m2 = _mm_loadu_ps(v+8);
1411                         m3 = _mm_loadu_ps(v+12);
1412                 }
1413                 else
1414                 {
1415                         m0 = _mm_load_ps(v);
1416                         m1 = _mm_load_ps(v+4);
1417                         m2 = _mm_load_ps(v+8);
1418                         m3 = _mm_load_ps(v+12);
1419                 }
1420                 if (transpose)
1421                 {
1422                         __m128 t0, t1, t2, t3;
1423                         t0 = _mm_unpacklo_ps(m0, m1);
1424                         t1 = _mm_unpacklo_ps(m2, m3);
1425                         t2 = _mm_unpackhi_ps(m0, m1);
1426                         t3 = _mm_unpackhi_ps(m2, m3);
1427                         m0 = _mm_movelh_ps(t0, t1);
1428                         m1 = _mm_movehl_ps(t1, t0);
1429                         m2 = _mm_movelh_ps(t2, t3);
1430                         m3 = _mm_movehl_ps(t3, t2);                     
1431                 }
1432                 _mm_store_ps(command->val, m0);
1433                 _mm_store_ps(command->val+4, m1);
1434                 _mm_store_ps(command->val+8, m2);
1435                 _mm_store_ps(command->val+12, m3);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1440         }
1441 #endif
1442 }
1443
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1446 {
1447         thread->uniform1i[command->index] = command->val;
1448 }
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1450 {
1451         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452         command->index = index;
1453         command->val = i0;
1454
1455         dpsoftrast.uniform1i[command->index] = i0;
1456 }
1457
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1460 {
1461         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1463 }
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1465 {
1466         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467         command->clipplane[0] = x;
1468         command->clipplane[1] = y;
1469         command->clipplane[2] = z;
1470         command->clipplane[3] = w;
1471 }
1472
1473 #ifdef SSE_POSSIBLE
// Copies `size` 4-float elements from a strided byte stream into dst.
// dst    - output, size*4 floats; written with _mm_store_ps, so it must be 16-byte aligned
// src    - first input element
// stride - byte distance between consecutive input elements
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	int i;
	// aligned loads are only usable when the base and every step are 16-byte multiples
	const int unaligned = ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) != 0;
	if (unaligned)
	{
		for (i = 0; i < size; i++, dst += 4, src += stride)
			_mm_store_ps(dst, _mm_loadu_ps((const float *)src));
	}
	else
	{
		for (i = 0; i < size; i++, dst += 4, src += stride)
			_mm_store_ps(dst, _mm_load_ps((const float *)src));
	}
}
1496
// Expands 3-float elements into 4-float vectors (x, y, z, 1.0f).
// dst    - output, size*4 floats; written with _mm_store_ps, so it must be 16-byte aligned
// src    - first input element
// size   - number of elements
// stride - byte distance between consecutive input elements
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 one = _mm_set_ss(1.0f);
	if (stride == sizeof(float[3]))
	{
		// tightly packed input: four elements occupy exactly three 16-byte loads
		float *end4 = dst + (size&~3)*4;
		const int unaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		while (dst < end4)
		{
			const float *f = (const float *)src;
			__m128 v1, v2, v3, dv;
			if (unaligned)
			{
				v1 = _mm_loadu_ps(f);
				v2 = _mm_loadu_ps(f + 4);
				v3 = _mm_loadu_ps(f + 8);
			}
			else
			{
				// src advances by 48 bytes per iteration, so alignment is preserved
				v1 = _mm_load_ps(f);
				v2 = _mm_load_ps(f + 4);
				v3 = _mm_load_ps(f + 8);
			}
			// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
			// each element is rotated so lane 0 is free, lane 0 is set to 1.0f,
			// then it is rotated back to x y z 1
			dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
			dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_move_ss(v3, one);
			_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dst += 16;
			src += 4*sizeof(float[3]);
		}
	}
	// leftover packed elements and all strided elements: read exactly the
	// 12 bytes of each element. A full 16-byte load here would touch 4 bytes
	// beyond the last element of the array. Neither load below has an
	// alignment requirement.
	while (dst < end)
	{
		__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)src); // x y 0 0
		__m128 z1 = _mm_unpacklo_ps(_mm_load_ss((const float *)src + 2), one); // z 1 0 0
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1)); // x y z 1
		dst += 4;
		src += stride;
	}
}
1573
// Expands 2-float elements into 4-float vectors (x, y, 0.0f, 1.0f).
// dst    - output, size*4 floats; written with _mm_store_ps, so it must be 16-byte aligned
// src    - first input element
// stride - byte distance between consecutive input elements
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	// supplies the constant upper half (0, 1) of every output vector
	const __m128 zw = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		// tightly packed input: one 16-byte load yields two elements
		float * const pairstop = dst + (size&~1)*4;
		const int unaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
		{
			const float *pair = (const float *)src;
			__m128 xyxy = unaligned ? _mm_loadu_ps(pair) : _mm_load_ps(pair);
			_mm_store_ps(dst, _mm_shuffle_ps(xyxy, zw, _MM_SHUFFLE(3, 2, 1, 0)));
			_mm_store_ps(dst + 4, _mm_movehl_ps(zw, xyxy));
		}
	}
	// odd leftover element, or every element when the input is strided
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(zw, (const __m64 *)src));
}
1611
// Expands 4-byte elements into 4-float vectors, scaling each byte by 1/255.
// dst    - output, size*4 floats; written with _mm_store_ps, so it must be 16-byte aligned
// src    - first input element
// size   - number of elements
// stride - byte distance between consecutive input elements
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 scale = _mm_set1_ps(1.0f/255.0f);
	const __m128i zero = _mm_setzero_si128();
	if (stride == sizeof(unsigned char[4]))
	{
		// tightly packed input: one 16-byte load yields four elements
		float *end4 = dst + (size&~3)*4;
		const int unaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		while (dst < end4)
		{
			__m128i v = unaligned ? _mm_loadu_si128((const __m128i *)src) : _mm_load_si128((const __m128i *)src);
			// widen bytes -> 16-bit -> 32-bit by interleaving with zero
			__m128i lo = _mm_unpacklo_epi8(v, zero), hi = _mm_unpackhi_epi8(v, zero);
			_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, zero)), scale));
			_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, zero)), scale));
			_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, zero)), scale));
			_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, zero)), scale));
			dst += 16;
			src += 4*sizeof(unsigned char[4]);
		}
	}
	// leftover packed elements and all strided elements, one at a time
	while (dst < end)
	{
		int packed;
		__m128i v;
		// memcpy instead of *(const int *)src: src carries no int alignment
		// guarantee and points at unsigned char data
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, zero), zero)), scale));
		dst += 4;
		src += stride;
	}
}
1654
// Writes the 4-float vector at src into `size` consecutive 4-float slots of dst.
// src is read with an unaligned load; dst is written with _mm_store_ps and so
// must be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1665 #endif
1666
// Multiplies `numitems` 4-float vectors by a 16-float matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// out4f is written with _mm_store_ps and so must be 16-byte aligned; in4f may
// be unaligned and may be the same buffer as out4f.
// Without SSE_POSSIBLE this function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int unaligned;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy (or nothing at all when in place)
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	unaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (i = 0; i < numitems; i++, out4f += 4, in4f += 4)
	{
		__m128 v = unaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		__m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
		__m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
		__m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
		__m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
		// summed as x + (y + (z + w))
		_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
	}
#endif
}
1714
1715 #if 0
// Copies `numitems` 4-float vectors from in4f to out4f.
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1720 #endif
1721
1722 #ifdef SSE_POSSIBLE
// Perspective-divides the __m128 vertex `in` = (x, y, z, w) and maps it
// through the viewport, storing the result in `out`.
// With c = viewportcenter and s = viewportscale (both __m128) the result is
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// i.e. lane 0 of center/scale applies to the constant 1 that ends up in the
// w lane, and lanes 1..3 apply to x, y and z.
// The macro declares block-local variables named p and w, so the arguments
// must not be expressions that refer to caller variables with those names.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1724 { \
1725         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1729 }
1730
// NOTE(review): the body is token-for-token the same as
// DPSOFTRAST_PROJECTVERTEX, so despite the name it projects all four lanes:
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// for in = (x, y, z, w), c = viewportcenter, s = viewportscale.
// Declares block-local variables named p and w.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1732 { \
1733         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1737 }
1738
// Transforms the __m128 vertex `in` = (x, y, z, w) by four __m128 matrix
// vectors and stores the result in `out`:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is evaluated once into a block-local variable named p, so `out` may be
// the same variable as `in`; m0..m3 must not refer to a caller variable named p.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1740 { \
1741         __m128 p = (in); \
1742         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1746 }
1747
// Computes a screen-space row range for the box spanned by minposf/maxposf
// after transformation by inmatrix16f, clipping against the z + w >= 0 plane.
// starty, endy - receive the truncated row range (see the end of the function)
// minposf, maxposf - 4-float box corners; read with _mm_load_ps, so they must
//                    be 16-byte aligned
// inmatrix16f  - 16-float matrix, read with unaligned loads
// Returns a mask with bit k still set for every box corner k whose z + w >= 0
// test failed.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 {
1750         int clipmask = 0xFF;
1751         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix vector so that lane 0 of each
        // transformed corner holds y; the scalar (_ss) operations below then
        // work directly on y
1755         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in
        // lane 0 of clipdist[k]. If that is >= 0 the corner's clipmask bit is
        // cleared and its y/w is folded into minproj/maxproj.
1759         #define BBFRONT(k, pos) \
1760         { \
1761                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1764                 { \
1765                         __m128 proj; \
1766                         clipmask &= ~(1<<k); \
1767                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768                         minproj = _mm_min_ss(minproj, proj); \
1769                         maxproj = _mm_max_ss(maxproj, proj); \
1770                 } \
1771         }
        // the eight corners: bit 0 of k selects max x, bit 1 selects max y,
        // bit 2 selects max z and w (both come from maxpos)
1772         BBFRONT(0, minpos); 
1773         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1774         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1778         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1779         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the test, visit its three edge
        // neighbours (k^1, k^2, k^4). For each neighbour that passed,
        // interpolate along the edge to the point where z + w reaches 0 and
        // fold that point's y/w into minproj/maxproj.
1780         #define BBCLIP(k) \
1781         { \
1782                 if (clipmask&(1<<k)) \
1783                 { \
1784                         if (!(clipmask&(1<<(k^1)))) \
1785                         { \
1786                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789                                 minproj = _mm_min_ss(minproj, proj); \
1790                                 maxproj = _mm_max_ss(maxproj, proj); \
1791                         } \
1792                         if (!(clipmask&(1<<(k^2)))) \
1793                         { \
1794                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797                                 minproj = _mm_min_ss(minproj, proj); \
1798                                 maxproj = _mm_max_ss(maxproj, proj); \
1799                         } \
1800                         if (!(clipmask&(1<<(k^4)))) \
1801                         { \
1802                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805                                 minproj = _mm_min_ss(minproj, proj); \
1806                                 maxproj = _mm_max_ss(maxproj, proj); \
1807                         } \
1808                 } \
1809         }
1810         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move lane 2 of the viewport vectors into lane 0 (the lane that
        // DPSOFTRAST_PROJECTVERTEX applies to y)
1811         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before scaling to pixels
1813         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): starty is taken from maxproj and endy from minproj,
        // which presumably relies on a negative y viewport scale - confirm
1817         *starty = _mm_cvttss_si32(maxproj);
1818         *endy = _mm_cvttss_si32(minproj)+1;
1819         return clipmask;
1820 }
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// Loads the client array selected by `inarray` for the current vertex range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into
// dpsoftrast.post_array4f[outarray] as 4-float vectors and returns that buffer.
// Color falls back to the constant dpsoftrast.color when no color pointer is
// set. A texcoord array with no pointer, or with a component count other than
// 2, 3 or 4, leaves the buffer contents untouched.
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		// every other array id is a texcoord unit
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
1997 #if 0
// Projects an array to screen space (into dpsoftrast.screencoord4f) without
// transforming it, stores the returned clip mask in dpsoftrast.drawclipped
// and the row range in dpsoftrast.drawstarty/drawendy, and returns the array.
// inarray >= 0 first loads that client array into post_array4f[outarray].
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
2008 #endif
2009
// Transforms an array in place by inmatrix16f, writes the projected vertices
// to dpsoftrast.screencoord4f, stores the returned clip mask in
// dpsoftrast.drawclipped and the row range in dpsoftrast.drawstarty/drawendy,
// and returns the array.
// inarray >= 0 first loads that client array into post_array4f[outarray].
// Returns NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2020
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
// Writes one span of shaded BGRA8 pixels into the color buffer
// dpsoftrast.fb_colorpixels[0], combining them with the pixels already there
// according to thread->fb_blendmode.
//   in4ub   - source colors, 4 bytes per pixel, indexed by x relative to span->x
//   span    - supplies x/y origin, startx/endx range and the per-pixel pixelmask
// Pixels are processed in runs of consecutive unmasked pixelmask bytes; masked
// pixels are skipped.
// Side effects on span->pixelmask (written through the pointer even though
// span itself is const):
//   - with SHADERPERMUTATION_ALPHAKILL, pixels whose source alpha is < 128 are masked off
//   - in the ALPHA/ADDALPHA/SUBALPHA modes, zero-alpha pixels are trimmed/masked
//   - pixelmask[endx] and pixelmask[endx+1] are overwritten with sentinels, so
//     the mask buffer must have room for two bytes past endx
// The wide run searches compare against 0x00000000 / 0x01010101, so mask bytes
// are presumably expected to be exactly 0 or 1 - TODO confirm with the writers of pixelmask.
// Does nothing when there is no color buffer, and compiles to an empty
// function without SSE_POSSIBLE. triangle is not referenced in the body.
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
2058         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
             // no color buffer bound: nothing to write
2060         if (!pixel)
2061                 return;
             // move both destination pointers to the span origin so they can be indexed by x
             // NOTE(review): pixel is not read again after this adjustment; all writes go through pixeli
2062         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063         pixeli += span->y * dpsoftrast.fb_width + span->x;
2064         // handle alphatest now (this affects depth writes too)
2065         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2066                 for (x = startx;x < endx;x++)
2067                         if (in4ub[x*4+3] < 128)
2068                                 pixelmask[x] = false;
2069         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070         // helps sprites, text and hud artwork
2071         switch(thread->fb_blendmode)
2072         {
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // startx is advanced to the first pixel with nonzero alpha and
                     // endx is pulled back to maskx, the end of the last nonzero-alpha
                     // run that was scanned; if no such pixel exists endx becomes
                     // startx and the span is empty
2076                 maskx = startx;
2077                 for (x = startx;x < endx;x++)
2078                 {
2079                         if (in4ub[x*4+3] >= 1)
2080                         {
2081                                 startx = x;
2082                                 for (;;)
2083                                 {
                                             // skip over the run of nonzero-alpha pixels
2084                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2085                                         maskx = x;
2086                                         if (x >= endx) break;
                                             // NOTE(review): x is the first zero-alpha pixel here; this ++x
                                             // together with the pre-increment in the loop below means
                                             // pixel x is left unmasked and pixel x+1 is never examined,
                                             // so a lone nonzero-alpha pixel at x+1 followed only by
                                             // zero-alpha pixels would fall beyond maskx - confirm intent
2087                                         ++x;
                                             // mask off the following run of zero-alpha pixels
2088                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089                                         if (x >= endx) break;
2090                                 }
2091                                 break;
2092                         }
2093                 }
2094                 endx = maskx;
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_OPAQUE:
2097         case DPSOFTRAST_BLENDMODE_ADD:
2098         case DPSOFTRAST_BLENDMODE_INVMOD:
2099         case DPSOFTRAST_BLENDMODE_MUL:
2100         case DPSOFTRAST_BLENDMODE_MUL2:
2101         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102         case DPSOFTRAST_BLENDMODE_INVADD:
2103                 break;
2104         }
2105         // put some special values at the end of the mask to ensure the loops end
             // (a 1 stops the "skip masked" scans, the following 0 stops the "unmasked run" scans)
2106         pixelmask[endx] = 1;
2107         pixelmask[endx+1] = 0;
2108         // LordHavoc: use a double loop to identify subspans, this helps the
2109         // optimized copy/blend loops to perform at their best, most triangles
2110         // have only one run of pixels, and do the search using wide reads...
2111         x = startx;
2112         while (x < endx)
2113         {
2114                 // if this pixel is masked off, it's probably not alone...
2115                 if (!pixelmask[x])
2116                 {
2117                         x++;
2118 #if 1
                             // fast path, only taken when at least 8 more pixels remain:
                             // step bytewise up to a multiple of 4, then test 4 mask bytes per read
2119                         if (x + 8 < endx)
2120                         {
2121                                 // the 4-item search must be aligned or else it stalls badly
2122                                 if ((x & 3) && !pixelmask[x]) 
2123                                 {
                                             // NOTE(review): this first test cannot succeed, the enclosing
                                             // condition already established !pixelmask[x]
2124                                         if(pixelmask[x]) goto endmasked;
2125                                         x++;
2126                                         if (x & 3)
2127                                         {
2128                                                 if(pixelmask[x]) goto endmasked;
2129                                                 x++;
2130                                                 if (x & 3)
2131                                                 {
2132                                                         if(pixelmask[x]) goto endmasked;
2133                                                         x++;
2134                                                 }
2135                                         }
2136                                 }
2137                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2138                                         x += 4;
2139                         }
2140 #endif
                             // finish bytewise; the sentinel at pixelmask[endx] guarantees termination
2141                         for (;!pixelmask[x];x++)
2142                                 ;
2143                         // rather than continue the loop, just check the end variable
2144                         if (x >= endx)
2145                                 break;
2146                 }
2147         endmasked:
                     // x is now the first pixel of an unmasked run; subx will become one past its end
2148                 // find length of subspan
2149                 subx = x + 1;
2150 #if 1
2151                 if (subx + 8 < endx)
2152                 {
2153                         if (subx & 3)
2154                         {
2155                                 if(!pixelmask[subx]) goto endunmasked;
2156                                 subx++;
2157                                 if (subx & 3)
2158                                 {
2159                                         if(!pixelmask[subx]) goto endunmasked;
2160                                         subx++;
2161                                         if (subx & 3)
2162                                         {
2163                                                 if(!pixelmask[subx]) goto endunmasked;
2164                                                 subx++;
2165                                         }
2166                                 }
2167                         }
2168                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2169                                 subx += 4;
2170                 }
2171 #endif
                     // finish bytewise; the sentinel at pixelmask[endx+1] guarantees termination
2172                 for (;pixelmask[subx];subx++)
2173                         ;
2174                 // the checks can overshoot, so make sure to clip it...
2175                 if (subx > endx)
2176                         subx = endx;
2177         endunmasked:
2178                 // now that we know the subspan length...  process!
                     // every case below advances x up to subx, which the outer loop then continues from
2179                 switch(thread->fb_blendmode)
2180                 {
2181                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels per iteration, then 4, then 2, then 1
2182 #if 0
2183                         if (subx - x >= 16)
2184                         {
2185                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2186                                 x = subx;
2187                         }
2188                         else
2189 #elif 1
2190                         while (x + 16 <= subx)
2191                         {
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2196                                 x += 16;
2197                         }
2198 #endif
2199                         {
2200                                 while (x + 4 <= subx)
2201                                 {
2202                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203                                         x += 4;
2204                                 }
2205                                 if (x + 2 <= subx)
2206                                 {
2207                                         pixeli[x] = ini[x];
2208                                         pixeli[x+1] = ini[x+1];
2209                                         x += 2;
2210                                 }
2211                                 if (x < subx)
2212                                 {
2213                                         pixeli[x] = ini[x];
2214                                         x++;
2215                                 }
2216                         }
2217                         break;
2218                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1): shared loop for all blending modes.
                     // Source and destination bytes are widened to 16-bit lanes (src, dst);
                     // blend2 is the statement applied to a pair of pixels, blend1 the one
                     // applied to a single trailing pixel. Each must leave its result in dst,
                     // which is packed back to bytes with unsigned saturation and stored.
                     // The macro is defined here, inside the switch, and reused by the cases below.
2219                 #define FINISHBLEND(blend2, blend1) \
2220                         for (;x + 1 < subx;x += 2) \
2221                         { \
2222                                 __m128i src, dst; \
2223                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225                                 blend2; \
2226                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227                         } \
2228                         if (x < subx) \
2229                         { \
2230                                 __m128i src, dst; \
2231                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233                                 blend1; \
2234                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235                                 x++; \
2236                         }
                             // dst += ((src - dst) * alpha) >> 8; both operands are shifted left by 4
                             // so the signed high-half multiply yields the product shifted right by 8.
                             // blend broadcasts lane 3 (the alpha byte) of each pixel to all four lanes
2237                         FINISHBLEND({
2238                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                         }, {
2241                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243                         });
2244                         break;
2245                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * alpha) >> 8
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2255                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2258                         FINISHBLEND({
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         }, {
2261                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                         });
2263                         break;
2264                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2265                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2268                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2269                         break;
2270                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * alpha) >> 8
2271                         FINISHBLEND({
2272                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                         }, {
2275                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277                         });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * alpha) >> 8)
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += ((255 - dst) * src) >> 8, using the same shift-by-4 high-multiply form as ALPHA
2289                         FINISHBLEND({
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         }, {
2292                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                         });
2294                         break;
2295                 }
2296         }
2297 #endif
2298 }
2299
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301         // warning: this is SLOW, only use if the optimized per-span functions won't do
2302 {
2303         const unsigned char * RESTRICT pixelbase;
2304         const unsigned char * RESTRICT pixel[4];
2305         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2306         int wrapmask[2] = { width-1, height-1 };
2307         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2308         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2309         {
2310                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2313                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2314                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2317                 {
2318                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2322                 }
2323                 else
2324                 {
2325                         tci[0] &= wrapmask[0];
2326                         tci[1] &= wrapmask[1];
2327                         tci1[0] &= wrapmask[0];
2328                         tci1[1] &= wrapmask[1];
2329                 }
2330                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2331                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2332                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2333                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2334                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2338         }
2339         else
2340         {
2341                 int tci[2] = { x * width, y * height };
2342                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2343                 {
2344                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2346                 }
2347                 else
2348                 {
2349                         tci[0] &= wrapmask[0];
2350                         tci[1] &= wrapmask[1];
2351                 }
2352                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2353                 c[0] = pixel[0][0];
2354                 c[1] = pixel[0][1];
2355                 c[2] = pixel[0][2];
2356                 c[3] = pixel[0][3];
2357         }
2358 }
2359
// Disabled (compiled out by #if 0) float-output texture span sampler.
// For each x in [span->startx, span->endx) it writes four floats to out4f[x*4..x*4+3],
// sampled from the texture bound to texunitindex at the mip level chosen in
// triangle->mip[texunitindex]; the stored bytes 2,1,0,3 become outputs 0,1,2,3.
// Texture coordinates come from attribute arrayindex (DPSOFTRAST_CALCATTRIB4F)
// multiplied by zf[], are evaluated exactly at sub-span end points spaced
// DPSOFTRAST_DRAW_MAXSUBSPAN apart and stepped in between with 12 fractional
// bits of fixed point.
// NOTE(review): tci/tcimin/tcimax are unsigned and tcimin is 0, so the
// ">= tcimin" halves of the clamp expressions can never be false, and negative
// coordinates reach subtc through a float-to-unsigned conversion - worth
// revisiting if this code is ever re-enabled.
2360 #if 0
2361 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2362 {
2363         int x;
2364         int startx = span->startx;
2365         int endx = span->endx;
2366         int flags;
2367         float c[4];
2368         float data[4];
2369         float slope[4];
2370         float tc[2], endtc[2];
2371         float tcscale[2];
2372         unsigned int tci[2];
2373         unsigned int tci1[2];
2374         unsigned int tcimin[2];
2375         unsigned int tcimax[2];
2376         int tciwrapmask[2];
2377         int tciwidth;
2378         int filter;
2379         int mip;
2380         const unsigned char * RESTRICT pixelbase;
2381         const unsigned char * RESTRICT pixel[4];
2382         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2383         // if no texture is bound, just fill it with white
2384         if (!texture)
2385         {
2386                 for (x = startx;x < endx;x++)
2387                 {
2388                         out4f[x*4+0] = 1.0f;
2389                         out4f[x*4+1] = 1.0f;
2390                         out4f[x*4+2] = 1.0f;
2391                         out4f[x*4+3] = 1.0f;
2392                 }
2393                 return;
2394         }
2395         mip = triangle->mip[texunitindex];
2396         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2397         // if this mipmap of the texture is 1 pixel, just fill it with that color
2398         if (texture->mipmap[mip][1] == 4)
2399         {
2400                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2401                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2402                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2403                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2404                 for (x = startx;x < endx;x++)
2405                 {
2406                         out4f[x*4+0] = c[0];
2407                         out4f[x*4+1] = c[1];
2408                         out4f[x*4+2] = c[2];
2409                         out4f[x*4+3] = c[3];
2410                 }
2411                 return;
2412         }
2413         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2414         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2415         flags = texture->flags;
2416         tcscale[0] = texture->mipmap[mip][2];
2417         tcscale[1] = texture->mipmap[mip][3];
             // negative row stride in texels: rows are addressed backwards from pixelbase
2418         tciwidth = -texture->mipmap[mip][2];
2419         tcimin[0] = 0;
2420         tcimin[1] = 0;
2421         tcimax[0] = texture->mipmap[mip][2]-1;
2422         tcimax[1] = texture->mipmap[mip][3]-1;
2423         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2424         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2425         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2426         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
             // bilinear filtering samples around the texel center, so shift back by half a texel
2427         if (filter)
2428         {
2429                 endtc[0] -= 0.5f;
2430                 endtc[1] -= 0.5f;
2431         }
2432         for (x = startx;x < endx;)
2433         {
2434                 unsigned int subtc[2];
2435                 unsigned int substep[2];
2436                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2437                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
                     // final sub-span: end exactly on the last pixel and rescale the step to its length
2438                 if (nextsub >= endx)
2439                 {
2440                         nextsub = endsub = endx-1;      
2441                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2442                 }
2443                 tc[0] = endtc[0];
2444                 tc[1] = endtc[1];
2445                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2446                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2447                 if (filter)
2448                 {
2449                         endtc[0] -= 0.5f;
2450                         endtc[1] -= 0.5f;
2451                 }
                     // per-pixel step and start coordinate in fixed point with 12 fractional bits
2452                 substep[0] = (endtc[0] - tc[0]) * subscale;
2453                 substep[1] = (endtc[1] - tc[1]) * subscale;
2454                 subtc[0] = tc[0] * (1<<12);
2455                 subtc[1] = tc[1] * (1<<12);
2456                 if (filter)
2457                 {
2458                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2459                         {
                                     // bilinear, clamped addressing
2460                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2461                                 {
2462                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2464                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465                                         tci[0] = subtc[0]>>12;
2466                                         tci[1] = subtc[1]>>12;
2467                                         tci1[0] = tci[0] + 1;
2468                                         tci1[1] = tci[1] + 1;
2469                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2470                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2471                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2472                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2473                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
                                             // the weights sum to 1<<24, so dividing by 0xFF000000 (255<<24) maps the result to 0..1
2477                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481                                         out4f[x*4+0] = c[0];
2482                                         out4f[x*4+1] = c[1];
2483                                         out4f[x*4+2] = c[2];
2484                                         out4f[x*4+3] = c[3];
2485                                 }
2486                         }
2487                         else
2488                         {
                                     // bilinear, wrapped addressing
2489                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2490                                 {
2491                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2492                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2493                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2494                                         tci[0] = subtc[0]>>12;
2495                                         tci[1] = subtc[1]>>12;
2496                                         tci1[0] = tci[0] + 1;
2497                                         tci1[1] = tci[1] + 1;
2498                                         tci[0] &= tciwrapmask[0];
2499                                         tci[1] &= tciwrapmask[1];
2500                                         tci1[0] &= tciwrapmask[0];
2501                                         tci1[1] &= tciwrapmask[1];
2502                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2503                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2504                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2505                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2506                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2507                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2508                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2509                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2510                                         out4f[x*4+0] = c[0];
2511                                         out4f[x*4+1] = c[1];
2512                                         out4f[x*4+2] = c[2];
2513                                         out4f[x*4+3] = c[3];
2514                                 }
2515                         }
2516                 }
2517                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518                 {
                             // nearest, clamped addressing
2519                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520                         {
2521                                 tci[0] = subtc[0]>>12;
2522                                 tci[1] = subtc[1]>>12;
2523                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2524                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2525                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2526                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2527                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2528                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2529                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2530                                 out4f[x*4+0] = c[0];
2531                                 out4f[x*4+1] = c[1];
2532                                 out4f[x*4+2] = c[2];
2533                                 out4f[x*4+3] = c[3];
2534                         }
2535                 }
2536                 else
2537                 {
                             // nearest, wrapped addressing
2538                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2539                         {
2540                                 tci[0] = subtc[0]>>12;
2541                                 tci[1] = subtc[1]>>12;
2542                                 tci[0] &= tciwrapmask[0];
2543                                 tci[1] &= tciwrapmask[1];
2544                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2545                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2546                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2547                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2548                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2549                                 out4f[x*4+0] = c[0];
2550                                 out4f[x*4+1] = c[1];
2551                                 out4f[x*4+2] = c[2];
2552                                 out4f[x*4+3] = c[3];
2553                         }
2554                 }
2555         }
2556 }
2557 #endif
2558
2559 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2560 {
2561 #ifdef SSE_POSSIBLE
2562         int x;
2563         int startx = span->startx;
2564         int endx = span->endx;
2565         int flags;
2566         __m128 data, slope, tcscale;
2567         __m128i tcsize, tcmask, tcoffset, tcmax;
2568         __m128 tc, endtc;
2569         __m128i subtc, substep, endsubtc;
2570         int filter;
2571         int mip;
2572         int affine; // LordHavoc: optimized affine texturing case
2573         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2574         const unsigned char * RESTRICT pixelbase;
2575         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2576         // if no texture is bound, just fill it with white
2577         if (!texture)
2578         {
2579                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2580                 return;
2581         }
2582         mip = triangle->mip[texunitindex];
2583         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2584         // if this mipmap of the texture is 1 pixel, just fill it with that color
2585         if (texture->mipmap[mip][1] == 4)
2586         {
2587                 unsigned int k = *((const unsigned int *)pixelbase);
2588                 for (x = startx;x < endx;x++)
2589                         outi[x] = k;
2590                 return;
2591         }
2592         affine = zf[startx] == zf[endx-1];
2593         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2594         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2595         flags = texture->flags;
2596         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2597         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2598         tcscale = _mm_cvtepi32_ps(tcsize);
2599         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2600         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2601         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2602         if (filter)
2603                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2604         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2605         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2606         tcmax = _mm_packs_epi32(tcmask, tcmask);
2607         for (x = startx;x < endx;)
2608         {
2609                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2610                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2611                 if (nextsub >= endx || affine)
2612                 {
2613                         nextsub = endsub = endx-1;
2614                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2615                 }       
2616                 tc = endtc;
2617                 subtc = endsubtc;
2618                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2619                 if (filter)
2620                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2621                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2622                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2623                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2624                 substep = _mm_slli_epi32(substep, 1);
2625                 if (filter)
2626                 {
2627                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2628                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2629                         {
2630                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2631                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2632                                 {
2633                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2634                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2635                                         tci = _mm_madd_epi16(tci, tcoffset);
2636                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2637                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2638                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2639                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2640                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2641                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2642                                         fracm = _mm_srli_epi16(subtc, 1);
2643                                         pix1 = _mm_add_epi16(pix1,
2644                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2645                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2646                                         pix3 = _mm_add_epi16(pix3,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2648                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2649                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2650                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2651                                         pix2 = _mm_add_epi16(pix2,
2652                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2653                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2654                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2655                                 }
2656                                 if (x <= endsub)
2657                                 {
2658                                         const unsigned char * RESTRICT ptr1;
2659                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2660                                         tci = _mm_madd_epi16(tci, tcoffset);
2661                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2662                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2664                                         fracm = _mm_srli_epi16(subtc, 1);
2665                                         pix1 = _mm_add_epi16(pix1,
2666                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2667                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2668                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2669                                         pix1 = _mm_add_epi16(pix1,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2672                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2673                                         x++;
2674                                 }
2675                         }
2676                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2677                         {
2678                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2679                                 {
2680                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2681                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2682                                         tci = _mm_madd_epi16(tci, tcoffset);
2683                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2684                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2685                                                                                         _mm_setzero_si128());
2686                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2687                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2688                                                                                         _mm_setzero_si128());
2689                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2690                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2691                                         tci = _mm_madd_epi16(tci, tcoffset);
2692                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2693                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2694                                                                                         _mm_setzero_si128());
2695                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2696                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2697                                                                                         _mm_setzero_si128());
2698                                         fracm = _mm_srli_epi16(subtc, 1);
2699                                         pix1 = _mm_add_epi16(pix1,
2700                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2701                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2702                                         pix3 = _mm_add_epi16(pix3,
2703                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2704                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2705                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2706                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2707                                         pix2 = _mm_add_epi16(pix2,
2708                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2709                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2710                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2711                                 }
2712                                 if (x <= endsub)
2713                                 {
2714                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2715                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2716                                         tci = _mm_madd_epi16(tci, tcoffset);
2717                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2718                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2719                                                                                         _mm_setzero_si128());
2720                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2721                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2722                                                                                         _mm_setzero_si128());
2723                                         fracm = _mm_srli_epi16(subtc, 1);
2724                                         pix1 = _mm_add_epi16(pix1,
2725                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2727                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2728                                         pix1 = _mm_add_epi16(pix1,
2729                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2730                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2731                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2732                                         x++;
2733                                 }
2734                         }
2735                         else
2736                         {
2737                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738                                 {
2739                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2740                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2741                                         tci = _mm_madd_epi16(tci, tcoffset);
2742                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2743                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2744                                                                                         _mm_setzero_si128());
2745                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2746                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2747                                                                                         _mm_setzero_si128());
2748                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2749                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2750                                         tci = _mm_madd_epi16(tci, tcoffset);
2751                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2752                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2753                                                                                         _mm_setzero_si128());
2754                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2755                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2756                                                                                         _mm_setzero_si128());
2757                                         fracm = _mm_srli_epi16(subtc, 1);
2758                                         pix1 = _mm_add_epi16(pix1,
2759                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2760                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2761                                         pix3 = _mm_add_epi16(pix3,
2762                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2763                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2764                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2765                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2766                                         pix2 = _mm_add_epi16(pix2,
2767                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2768                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2769                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2770                                 }
2771                                 if (x <= endsub)
2772                                 {
2773                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2774                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2775                                         tci = _mm_madd_epi16(tci, tcoffset);
2776                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2777                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2778                                                                                         _mm_setzero_si128());
2779                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2780                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2781                                                                                         _mm_setzero_si128());
2782                                         fracm = _mm_srli_epi16(subtc, 1);
2783                                         pix1 = _mm_add_epi16(pix1,
2784                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2786                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2787                                         pix1 = _mm_add_epi16(pix1,
2788                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2789                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2790                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2791                                         x++;
2792                                 }
2793                         }
2794                 }
2795                 else
2796                 {
2797                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2798                         {
2799                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2800                                 {
2801                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2802                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2803                                         tci = _mm_madd_epi16(tci, tcoffset);
2804                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2805                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2806                                 }
2807                                 if (x <= endsub)
2808                                 {
2809                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2810                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2811                                         tci = _mm_madd_epi16(tci, tcoffset);
2812                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2813                                         x++;
2814                                 }
2815                         }
2816                         else
2817                         {
2818                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2819                                 {
2820                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2821                                         tci = _mm_and_si128(tci, tcmax); 
2822                                         tci = _mm_madd_epi16(tci, tcoffset);
2823                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2824                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2825                                 }
2826                                 if (x <= endsub)
2827                                 {
2828                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2829                                         tci = _mm_and_si128(tci, tcmax); 
2830                                         tci = _mm_madd_epi16(tci, tcoffset);
2831                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2832                                         x++;
2833                                 }
2834                         }
2835                 }
2836         }
2837 #endif
2838 }
2839
2840 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2841 {
2842         // TODO: IMPLEMENT
2843         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2844 }
2845
// Shadowmap lookup placeholder: the input vector is ignored and the constant
// 1.0f is returned (presumably meaning "not shadowed" to callers).
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float unshadowed = 1.0f;
	(void)vector;
	return unshadowed;
}
2851
2852 #if 0
2853 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2854 {
2855         int x;
2856         int startx = span->startx;
2857         int endx = span->endx;
2858         float c[4];
2859         float data[4];
2860         float slope[4];
2861         float z;
2862         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2863         for (x = startx;x < endx;x++)
2864         {
2865                 z = zf[x];
2866                 c[0] = (data[0] + slope[0]*x) * z;
2867                 c[1] = (data[1] + slope[1]*x) * z;
2868                 c[2] = (data[2] + slope[2]*x) * z;
2869                 c[3] = (data[3] + slope[3]*x) * z;
2870                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2871                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2872                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2873                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2874         }
2875 }
2876 #endif
2877
2878 #if 0
2879 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2880 {
2881         int x;
2882         int startx = span->startx;
2883         int endx = span->endx;
2884         float c[4];
2885         float data[4];
2886         float slope[4];
2887         float z;
2888         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2889         for (x = startx;x < endx;x++)
2890         {
2891                 z = zf[x];
2892                 c[0] = (data[0] + slope[0]*x) * z;
2893                 c[1] = (data[1] + slope[1]*x) * z;
2894                 c[2] = (data[2] + slope[2]*x) * z;
2895                 c[3] = (data[3] + slope[3]*x) * z;
2896                 out4f[x*4+0] = c[0];
2897                 out4f[x*4+1] = c[1];
2898                 out4f[x*4+2] = c[2];
2899                 out4f[x*4+3] = c[3];
2900         }
2901 }
2902 #endif
2903
2904 #if 0
2905 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2906 {
2907         int x, startx = span->startx, endx = span->endx;
2908         float c[4], localcolor[4];
2909         localcolor[0] = subcolor[0];
2910         localcolor[1] = subcolor[1];
2911         localcolor[2] = subcolor[2];
2912         localcolor[3] = subcolor[3];
2913         for (x = startx;x < endx;x++)
2914         {
2915                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2916                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2917                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2918                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2919                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2920                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2921                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2922                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2923         }
2924 }
2925 #endif
2926
2927 #if 0
2928 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2929 {
2930         int x, startx = span->startx, endx = span->endx;
2931         for (x = startx;x < endx;x++)
2932         {
2933                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2937         }
2938 }
2939 #endif
2940
2941 #if 0
2942 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2943 {
2944         int x, startx = span->startx, endx = span->endx;
2945         for (x = startx;x < endx;x++)
2946         {
2947                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2948                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2949                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2950                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2951         }
2952 }
2953 #endif
2954
#if 0
// disabled float reference path: alpha blend of inb4f over ina4f using the
// alpha (fourth component) of inb4f as the blend weight, for every pixel x
// in [span->startx, span->endx)
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	const int endx = span->endx;
	int x;
	for (x = span->startx;x < endx;x++)
	{
		const float *under = ina4f + x*4;
		const float *over = inb4f + x*4;
		float *o = out4f + x*4;
		// both weights are read before anything is written for this pixel
		const float keep = 1.0f - over[3];
		const float weight = over[3];
		o[0] = under[0] * keep + over[0] * weight;
		o[1] = under[1] * keep + over[1] * weight;
		o[2] = under[2] * keep + over[2] * weight;
		o[3] = under[3] * keep + over[3] * weight;
	}
}
#endif
2971
#if 0
// disabled float reference path: blends one constant color over in4f using
// color[3] as the blend weight, for every pixel x in
// [span->startx, span->endx)
static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
{
	// the color is copied to locals before the loop writes any output
	const float tint0 = color[0];
	const float tint1 = color[1];
	const float tint2 = color[2];
	const float tint3 = color[3];
	const float keep = 1.0f - tint3;
	const float weight = tint3;
	const int endx = span->endx;
	int x;
	for (x = span->startx;x < endx;x++)
	{
		const float *src = in4f + x*4;
		float *o = out4f + x*4;
		o[0] = src[0] * keep + tint0 * weight;
		o[1] = src[1] * keep + tint1 * weight;
		o[2] = src[2] * keep + tint2 * weight;
		o[3] = src[3] * keep + tint3 * weight;
	}
}
#endif
2992
2993
2994
2995 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2996 {
2997 #ifdef SSE_POSSIBLE
2998         int x;
2999         int startx = span->startx;
3000         int endx = span->endx;
3001         __m128 data, slope;
3002         __m128 mod, endmod;
3003         __m128i submod, substep, endsubmod;
3004         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3005         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3006         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3007         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3008         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3009         for (x = startx; x < endx;)
3010         {
3011                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3012                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3013                 if (nextsub >= endx)
3014                 {
3015                         nextsub = endsub = endx-1;
3016                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3017                 }
3018                 mod = endmod;
3019                 submod = endsubmod;
3020                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3021                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3022                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3023                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3024                 substep = _mm_packs_epi32(substep, substep);
3025                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3026                 {
3027                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3028                         pix = _mm_mulhi_epu16(pix, submod);
3029                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3030                 }
3031                 if (x <= endsub)
3032                 {
3033                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3034                         pix = _mm_mulhi_epu16(pix, submod);
3035                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3036                         x++;
3037                 }
3038         }
3039 #endif
3040 }
3041
3042 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3043 {
3044 #ifdef SSE_POSSIBLE
3045         int x;
3046         int startx = span->startx;
3047         int endx = span->endx;
3048         __m128 data, slope;
3049         __m128 mod, endmod;
3050         __m128i submod, substep, endsubmod;
3051         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3052         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3053         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3054         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3055         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3056         for (x = startx; x < endx;)
3057         {
3058                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3059                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3060                 if (nextsub >= endx)
3061                 {
3062                         nextsub = endsub = endx-1;
3063                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3064                 }
3065                 mod = endmod;
3066                 submod = endsubmod;
3067                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3068                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3069                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3070                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3071                 substep = _mm_packs_epi32(substep, substep);
3072                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3073                 {
3074                         __m128i pix = _mm_srai_epi16(submod, 4);
3075                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3076                 }
3077                 if (x <= endsub)
3078                 {
3079                         __m128i pix = _mm_srai_epi16(submod, 4);
3080                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3081                         x++;
3082                 }
3083         }
3084 #endif
3085 }
3086
3087 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3088 {
3089 #ifdef SSE_POSSIBLE
3090         int x, startx = span->startx, endx = span->endx;
3091         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3092         localcolor = _mm_packs_epi32(localcolor, localcolor);
3093         for (x = startx;x+2 <= endx;x+=2)
3094         {
3095                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3096                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3097                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3098                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3099         }
3100         if (x < endx)
3101         {
3102                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3105                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3106         }
3107 #endif
3108 }
3109
3110 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3111 {
3112 #ifdef SSE_POSSIBLE
3113         int x, startx = span->startx, endx = span->endx;
3114         for (x = startx;x+2 <= endx;x+=2)
3115         {
3116                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3118                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3120         }
3121         if (x < endx)
3122         {
3123                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3125                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3126                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3127         }
3128 #endif
3129 }
3130
3131 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 {
3133 #ifdef SSE_POSSIBLE
3134         int x, startx = span->startx, endx = span->endx;
3135         for (x = startx;x+2 <= endx;x+=2)
3136         {
3137                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3139                 pix1 = _mm_add_epi16(pix1, pix2);
3140                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3141         }
3142         if (x < endx)
3143         {
3144                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3146                 pix1 = _mm_add_epi16(pix1, pix2);
3147                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3148         }
3149 #endif
3150 }
3151
#if 0
// disabled: BGRA8 tinted additive combine for pixels
// [span->startx, span->endx):
// out4ub = ina4ub + inb4ub * inbtintbgra per byte, saturated to 255.
// The tint is used in the order given (no component swap is applied).
static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
{
#ifdef SSE_POSSIBLE
	const int endx = span->endx;
	int x = span->startx;
	__m128i base, added;
	// tint as 8.8 fixed point words, replicated for both pixels of a pair
	__m128i tintwords = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
	tintwords = _mm_packs_epi32(tintwords, tintwords);
	// two pixels (eight bytes) per iteration
	while (x + 2 <= endx)
	{
		base = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
		added = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
		base = _mm_add_epi16(base, _mm_mulhi_epu16(tintwords, added));
		_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(base, base));
		x += 2;
	}
	// odd pixel at the end of the span
	if (x < endx)
	{
		base = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
		added = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
		base = _mm_add_epi16(base, _mm_mulhi_epu16(tintwords, added));
		*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(base, base));
	}
#endif
}
#endif
3176
3177 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3178 {
3179 #ifdef SSE_POSSIBLE
3180         int x, startx = span->startx, endx = span->endx;
3181         for (x = startx;x+2 <= endx;x+=2)
3182         {
3183                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3184                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3185                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3186                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3187                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3188         }
3189         if (x < endx)
3190         {
3191                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3192                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3193                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3194                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3195                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3196         }
3197 #endif
3198 }
3199
3200 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3201 {
3202 #ifdef SSE_POSSIBLE
3203         int x, startx = span->startx, endx = span->endx;
3204         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3205         localcolor = _mm_packs_epi32(localcolor, localcolor);
3206         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3207         for (x = startx;x+2 <= endx;x+=2)
3208         {
3209                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3210                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3211                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3212         }
3213         if (x < endx)
3214         {
3215                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3216                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3217                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3218         }
3219 #endif
3220 }
3221
3222
3223
3224 static void DPSOFTRAST_VertexShader_Generic(void)
3225 {
3226         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3228         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3229         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3230                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3231 }
3232
3233 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3234 {
3235         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3241         {
3242                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3243                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3244                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3245                 {
3246                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3247                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3248                         {
3249                                 // multiply
3250                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251                         }
3252                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3253                         {
3254                                 // add
3255                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3256                         }
3257                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3258                         {
3259                                 // alphablend
3260                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3261                         }
3262                 }
3263         }
3264         else
3265                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3266         if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3267         {
3268                 int x;
3269                 for (x = span->startx;x < span->endx;x++)
3270                         buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3271         }
3272         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3273 }
3274
3275
3276
3277 static void DPSOFTRAST_VertexShader_PostProcess(void)
3278 {
3279         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3280         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3281         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3282 }
3283
3284 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3285 {
3286         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3287         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3288         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3289         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3291         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3292         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3293         {
3294                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3295                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3296         }
3297         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3298         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3299         {
3300                 // TODO: implement saturation
3301         }
3302         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3303         {
3304                 // TODO: implement gammaramps
3305         }
3306         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3307 }
3308
3309
3310
3311 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3312 {
3313         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3314 }
3315
3316 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3317 {
3318         // this is never called (because colormask is off when this shader is used)
3319         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3320         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3321         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3322         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3323         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3324 }
3325
3326
3327
3328 static void DPSOFTRAST_VertexShader_FlatColor(void)
3329 {
3330         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3331         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3332 }
3333
3334 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3335 {
3336 #ifdef SSE_POSSIBLE
3337         unsigned char * RESTRICT pixelmask = span->pixelmask;
3338         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3339         int x, startx = span->startx, endx = span->endx;
3340         __m128i Color_Ambientm;
3341         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3342         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3343         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3344         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3345         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3346         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3347                 pixel = buffer_FragColorbgra8;
3348         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3349         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3350         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3351         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3352         for (x = startx;x < endx;x++)
3353         {
3354                 __m128i color, pix;
3355                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3356                 {
3357                         __m128i pix2;
3358                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3359                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3360                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3361                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3362                         x += 3;
3363                         continue;
3364                 }
3365                 if (!pixelmask[x])
3366                         continue;
3367                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3368                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3369                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3370         }
3371         if (pixel == buffer_FragColorbgra8)
3372                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3373 #endif
3374 }
3375
3376
3377
3378 static void DPSOFTRAST_VertexShader_VertexColor(void)
3379 {
3380         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3381         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3382         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3383 }
3384
3385 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3386 {
3387 #ifdef SSE_POSSIBLE
3388         unsigned char * RESTRICT pixelmask = span->pixelmask;
3389         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3390         int x, startx = span->startx, endx = span->endx;
3391         __m128i Color_Ambientm, Color_Diffusem;
3392         __m128 data, slope;
3393         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3394         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3396         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3397         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3398         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3399         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3400                 pixel = buffer_FragColorbgra8;
3401         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3404         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3405         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3406         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3407         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3408         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3409         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3410         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3411         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3412         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3413         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3414         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3415         {
3416                 __m128i color, mod, pix;
3417                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3418                 {
3419                         __m128i pix2, mod2;
3420                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3421                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3422                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3423                         data = _mm_add_ps(data, slope);
3424                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3425                         data = _mm_add_ps(data, slope);
3426                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3427                         data = _mm_add_ps(data, slope);
3428                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3429                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3430                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3431                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3432                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3433                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3434                         x += 3;
3435                         continue;
3436                 }
3437                 if (!pixelmask[x])
3438                         continue;
3439                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3440                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3441                 mod = _mm_packs_epi32(mod, mod);
3442                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3443                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3444         }
3445         if (pixel == buffer_FragColorbgra8)
3446                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3447 #endif
3448 }
3449
3450
3451
3452 static void DPSOFTRAST_VertexShader_Lightmap(void)
3453 {
3454         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3455         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3456         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3457 }
3458
// Pixel stage of the Lightmap shader, processing one span.
//
// Per covered pixel (16-bit fixed point, per BGRA channel):
//   out = ((Color_Diffuse * lightmap) + Color_Ambient) * color  [+ Color_Glow * glow]
// where color is sampled from GL20TU_COLOR with TEXCOORD0, lightmap from
// GL20TU_LIGHTMAP with TEXCOORD4, and glow (only with SHADERPERMUTATION_GLOW)
// from GL20TU_GLOW with TEXCOORD0.
//
// thread/triangle/span are handed through to the DPSOFTRAST_Draw_Span_* helpers;
// span additionally supplies the coverage mask, the x range and the framebuffer location.
//
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise this
// function does nothing.
3459 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3460 {
3461 #ifdef SSE_POSSIBLE
3462         unsigned char * RESTRICT pixelmask = span->pixelmask;
        // by default results are written straight into color buffer 0 at the span's position (4 bytes per pixel)
3463         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3464         int x, startx = span->startx, endx = span->endx;
3465         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3466         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3467         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3468         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3469         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3470         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3471         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3472         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3473         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
        // alpha kill or any non-opaque blend mode: render into the temporary span buffer instead,
        // which is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end of the function
3474         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3475                 pixel = buffer_FragColorbgra8;
        // convert the color uniforms to integers scaled by 256; the shuffle swaps lanes 0 and 2
        // (presumably RGBA -> BGRA to match the bgra8 buffers) and lane 3 is cleared
3476         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3477         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        // the ambient term carries the output alpha: lane 3 = Alpha uniform scaled by 255
3478         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        // pack to 16 bit lanes; packing the value with itself repeats the 4 channels twice (two pixels per register)
3479         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3480         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3481         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3483         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3484         {
                // glow variant: the glow texture is only sampled when this permutation is active
3485                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3486                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3487                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3488                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                // low 64 bits = ambient, high 64 bits = glow; used by the single pixel path below
3489                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3490                 for (x = startx;x < endx;x++)
3491                 {
3492                         __m128i color, lightmap, glow, pix;
                        // fast path: at least 4 pixels remain and all 4 mask bytes are 1, so shade 4 pixels at once
3493                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3494                         {
3495                                 __m128i pix2;
3496                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3497                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3498                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                // unpacking with zero in the low byte puts each texture byte into the high byte of a 16 bit lane,
                                // so each mulhi (high 16 bits of the product) yields the scaled result directly
                                // pix covers pixels x and x+1 (low 8 bytes), pix2 covers x+2 and x+3 (high 8 bytes)
3499                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3500                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3501                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3502                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3503                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3504                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                // pack back to bytes with unsigned saturation and store all 4 pixels
3505                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                // together with the loop's x++ this advances by 4 pixels
3506                                 x += 3;
3507                                 continue;
3508                         }
                        // single pixel path: skip pixels that are not covered
3509                         if (!pixelmask[x])
3510                                 continue;
3511                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3512                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3513                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                        // low half: (diffuse*lightmap + ambient) * color, high half: (0 + glowcolor) * glow, in one multiply
3514                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                        // add the glow term (high half) onto the lit color (low half)
3515                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3516                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3517                 }
3518         }
3519         else
3520         {
                // same as above without the glow term
3521                 for (x = startx;x < endx;x++)
3522                 {
3523                         __m128i color, lightmap, pix;
                        // fast path: 4 fully covered pixels at once
3524                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3525                         {
3526                                 __m128i pix2;
3527                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3528                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3529                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3530                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3531                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3532                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3533                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                // together with the loop's x++ this advances by 4 pixels
3534                                 x += 3;
3535                                 continue;
3536                         }
                        // single pixel path: skip pixels that are not covered
3537                         if (!pixelmask[x]) 
3538                                 continue;
3539                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3540                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3541                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3542                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3543                 }
3544         }
        // if the temporary buffer was used, let the finish helper write it to the framebuffer
        // (presumably applying alpha kill / blending - helper not visible here)
3545         if (pixel == buffer_FragColorbgra8)
3546                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3547 #endif
3548 }
3549
3550
// forward declarations: the FakeLight and LightDirectionMap shaders below call these
// before their definitions appear further down in this file
3551 void DPSOFTRAST_VertexShader_LightDirection(void);
3552 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3553
// Vertex stage of the FakeLight shader: identical to the LightDirection vertex shader.
3554 static void DPSOFTRAST_VertexShader_FakeLight(void)
3555 {
3556         DPSOFTRAST_VertexShader_LightDirection();
3557 }
3558
// Pixel stage of the FakeLight shader: forwards to the LightDirection pixel shader,
// which checks thread->shader_mode for SHADERMODE_FAKELIGHT itself.
3559 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3560 {
3561         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3562 }
3563
3564
3565
// Vertex stage of the LightDirectionMap_ModelSpace shader: the LightDirection vertex
// shader plus TEXCOORD4, which the pixel shader uses for the lightmap and deluxemap lookups.
3566 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3567 {
3568         DPSOFTRAST_VertexShader_LightDirection();
3569         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3570 }
3571
// Pixel stage of the LightDirectionMap_ModelSpace shader: forwards to the LightDirection
// pixel shader, which checks thread->shader_mode for SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE itself.
3572 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3573 {
3574         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3575 }
3576
3577
3578
// Vertex stage of the LightDirectionMap_TangentSpace shader: the LightDirection vertex
// shader plus TEXCOORD4, which the pixel shader uses for the lightmap and deluxemap lookups.
3579 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3580 {
3581         DPSOFTRAST_VertexShader_LightDirection();
3582         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3583 }
3584
// Pixel stage of the LightDirectionMap_TangentSpace shader: forwards to the LightDirection
// pixel shader, which checks thread->shader_mode for SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE itself.
3585 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3586 {
3587         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3588 }
3589
3590
3591
3592 void DPSOFTRAST_VertexShader_LightDirection(void)
3593 {
3594         int i;
3595         int numvertices = dpsoftrast.numvertices;
3596         float LightDir[4];
3597         float LightVector[4];
3598         float EyePosition[4];
3599         float EyeVectorModelSpace[4];
3600         float EyeVector[4];
3601         float position[4];
3602         float svector[4];
3603         float tvector[4];
3604         float normal[4];
3605         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3606         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3607         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3608         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3609         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3610         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3611         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3612         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3613         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3614         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3615         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3616         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3617         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3618         for (i = 0;i < numvertices;i++)
3619         {
3620                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3621                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3622                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3623                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3624                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3625                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3626                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3627                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3628                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3629                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3630                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3631                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3632                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3633                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3634                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3635                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3636                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3637                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3638                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3639                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3640                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3641                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3642                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3643                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3644                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3645                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3646                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3647                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3648                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3649         }
3650         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3651 }
3652
3653 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3654 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3655 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3656 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3657 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3658 #define DPSOFTRAST_Vector3Normalize(v)\
3659 do\
3660 {\
3661         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3662         if (len)\
3663         {\
3664                 len = 1.0f / len;\
3665                 v[0] *= len;\
3666                 v[1] *= len;\
3667                 v[2] *= len;\
3668         }\
3669 }\
3670 while(0)
3671
3672 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3673 {
3674         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3675         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3680         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3681         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3682         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3683         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3684         int x, startx = span->startx, endx = span->endx;
3685         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3686         float LightVectordata[4];
3687         float LightVectorslope[4];
3688         float EyeVectordata[4];
3689         float EyeVectorslope[4];
3690         float VectorSdata[4];
3691         float VectorSslope[4];
3692         float VectorTdata[4];
3693         float VectorTslope[4];
3694         float VectorRdata[4];
3695         float VectorRslope[4];
3696         float z;
3697         float diffusetex[4];
3698         float glosstex[4];
3699         float surfacenormal[4];
3700         float lightnormal[4];
3701         float lightnormal_modelspace[4];
3702         float eyenormal[4];
3703         float specularnormal[4];
3704         float diffuse;
3705         float specular;
3706         float SpecularPower;
3707         int d[4];
3708         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3709         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3710         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3711         Color_Glow[3] = 0.0f;
3712         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3713         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3714         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3715         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3716         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3717         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3718         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3719         Color_Pants[3] = 0.0f;
3720         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3721         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3722         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3723         Color_Shirt[3] = 0.0f;
3724         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3725         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3726         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3727         {
3728                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3730         }
3731         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3732         {
3733                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3734         }
3735         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3736         {
3737                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3738                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3739                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3740                 Color_Diffuse[3] = 0.0f;
3741                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3742                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3743                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3744                 LightColor[3] = 0.0f;
3745                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3746                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3747                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3748                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3749                 Color_Specular[3] = 0.0f;
3750                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3751                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3752                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3753
3754                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3755                 {
3756                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3757                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3758                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3759                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3761                 }
3762                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3763                 {
3764                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3765                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3766                 }
3767                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3768                 {
3769                         // nothing of this needed
3770                 }
3771                 else
3772                 {
3773                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3774                 }
3775
3776                 for (x = startx;x < endx;x++)
3777                 {
3778                         z = buffer_z[x];
3779                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3780                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3781                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3782                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3783                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3784                         {
3785                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3786                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3787                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3788                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3789                         }
3790                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3791                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3792                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3793                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3794                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3795                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3796                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3797                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3798
3799                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3800                         {
3801                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3802                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3803                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3804                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3805
3806                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3807                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3808                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3809                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3810
3811                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3812                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3813                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3814                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3815
3816                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3817                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3818                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3819                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3820
3821                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3822                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3823
3824                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3825                                 {
3826                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3827                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3828                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3829                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3830                                 }
3831                         }
3832                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3833                         {
3834                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3835                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3836                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3837                                 {
3838                                         float f = 1.0f / 256.0f;
3839                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3840                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3841                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3842                                 }
3843                         }
3844                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3845                         {
3846                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3847                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3848                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3849                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3850
3851                                 LightColor[0] = 1.0;
3852                                 LightColor[1] = 1.0;
3853                                 LightColor[2] = 1.0;
3854                         }
3855                         else
3856                         {
3857                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3858                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3859                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3860                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3861                         }
3862
3863                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3864
3865                         if(thread->shader_exactspecularmath)
3866                         {
3867                                 // reflect lightnormal at surfacenormal, take the negative of that
3868                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3869                                 float f;
3870                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3871                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3872                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3873                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3874
3875                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3876                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3877                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3878                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3879                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3880
3881                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3882                         }
3883                         else
3884                         {
3885                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3886                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3887                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3888                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3889
3890                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3891                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3892                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3893                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3894
3895                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3896                         }
3897                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3898
3899                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3900                         {
3901                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3902                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3903                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3904                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3905                         }
3906                         else
3907                         {
3908                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3909                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3910                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3911                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3912                         }
3913
3914                         buffer_FragColorbgra8[x*4+0] = d[0];
3915                         buffer_FragColorbgra8[x*4+1] = d[1];
3916                         buffer_FragColorbgra8[x*4+2] = d[2];
3917                         buffer_FragColorbgra8[x*4+3] = d[3];
3918                 }
3919         }
3920         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3921         {
3922                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3923                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3924                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3925                 Color_Diffuse[3] = 0.0f;
3926                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3927                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3928                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3929                 LightColor[3] = 0.0f;
3930                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3931
3932                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3933                 {
3934                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3935                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3936                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3937                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3939                 }
3940                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3941                 {
3942                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3943                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3944                 }
3945                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3946                 {
3947                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3948                 }
3949                 else
3950                 {
3951                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3952                 }
3953
3954                 for (x = startx;x < endx;x++)
3955                 {
3956                         z = buffer_z[x];
3957                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3958                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3959                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3960                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3961                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3962                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3963                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3964                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3965
3966                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3967                         {
3968                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3969                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3970                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3971                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3972
3973                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3974                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3975                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3976                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3977
3978                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3979                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3980                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3981                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3982
3983                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3984                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3985                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3986                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3987
3988                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3989                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3990
3991                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3992                                 {
3993                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3994                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3995                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3996                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3997                                 }
3998                         }
3999                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4000                         {
4001                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4002                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4003                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4004                                 {
4005                                         float f = 1.0f / 256.0f;
4006                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4007                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4008                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4009                                 }
4010                         }
4011                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4012                         {
4013                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4014                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4015                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4016                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4017
4018                                 LightColor[0] = 1.0;
4019                                 LightColor[1] = 1.0;
4020                                 LightColor[2] = 1.0;
4021                         }
4022                         else
4023                         {
4024                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4025                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4026                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4027                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4028                         }
4029
4030                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4031                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4032                         {
4033                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4034                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4035                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4036                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4037                         }
4038                         else
4039                         {
4040                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4041                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4042                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4043                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4044                         }
4045                         buffer_FragColorbgra8[x*4+0] = d[0];
4046                         buffer_FragColorbgra8[x*4+1] = d[1];
4047                         buffer_FragColorbgra8[x*4+2] = d[2];
4048                         buffer_FragColorbgra8[x*4+3] = d[3];
4049                 }
4050         }
4051         else
4052         {
4053                 for (x = startx;x < endx;x++)
4054                 {
4055                         z = buffer_z[x];
4056                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4057                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4058                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4059                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4060
4061                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4062                         {
4063                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4064                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4065                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4066                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4067                         }
4068                         else
4069                         {
4070                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4071                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4072                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4073                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4074                         }
4075                         buffer_FragColorbgra8[x*4+0] = d[0];
4076                         buffer_FragColorbgra8[x*4+1] = d[1];
4077                         buffer_FragColorbgra8[x*4+2] = d[2];
4078                         buffer_FragColorbgra8[x*4+3] = d[3];
4079                 }
4080         }
4081         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4082 }
4083
4084
4085
4086 static void DPSOFTRAST_VertexShader_LightSource(void)
4087 {
4088         int i;
4089         int numvertices = dpsoftrast.numvertices;
4090         float LightPosition[4];
4091         float LightVector[4];
4092         float LightVectorModelSpace[4];
4093         float EyePosition[4];
4094         float EyeVectorModelSpace[4];
4095         float EyeVector[4];
4096         float position[4];
4097         float svector[4];
4098         float tvector[4];
4099         float normal[4];
4100         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4101         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4102         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4103         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4104         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4105         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4106         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4107         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4108         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4109         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4110         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4111         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4112         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4113         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4114         for (i = 0;i < numvertices;i++)
4115         {
4116                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4117                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4118                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4119                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4120                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4121                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4122                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4123                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4124                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4125                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4126                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4127                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4128                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4129                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4130                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4131                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4132                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4133                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4134                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4135                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4136                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4137                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4138                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4139                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4140                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4141                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4142                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4143                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4144                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4145                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4146                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4147                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4148         }
4149         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4150         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4151 }
4152
4153 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4154 {
4155 #ifdef SSE_POSSIBLE
4156         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4160         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4162         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4163         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4164         int x, startx = span->startx, endx = span->endx;
4165         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4166         float CubeVectordata[4];
4167         float CubeVectorslope[4];
4168         float LightVectordata[4];
4169         float LightVectorslope[4];
4170         float EyeVectordata[4];
4171         float EyeVectorslope[4];
4172         float z;
4173         float diffusetex[4];
4174         float glosstex[4];
4175         float surfacenormal[4];
4176         float lightnormal[4];
4177         float eyenormal[4];
4178         float specularnormal[4];
4179         float diffuse;
4180         float specular;
4181         float SpecularPower;
4182         float CubeVector[4];
4183         float attenuation;
4184         int d[4];
4185         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4186         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4187         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4188         Color_Glow[3] = 0.0f;
4189         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4190         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4191         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4192         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4193         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4194         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4195         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4196         Color_Diffuse[3] = 0.0f;
4197         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4198         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4199         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4200         Color_Specular[3] = 0.0f;
4201         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4202         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4203         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4204         Color_Pants[3] = 0.0f;
4205         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4206         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4207         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4208         Color_Shirt[3] = 0.0f;
4209         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4210         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4211         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4212         LightColor[3] = 0.0f;
4213         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4214         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4215         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4216         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4217         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4218         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4219         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4221         {
4222                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4223                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4224         }
4225         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4226                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4227         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4228         {
4229                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4230                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4231                 for (x = startx;x < endx;x++)
4232                 {
4233                         z = buffer_z[x];
4234                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4235                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4236                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4237                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4238                         if (attenuation < 0.01f)
4239                                 continue;
4240                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4241                         {
4242                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4243                                 if (attenuation < 0.01f)
4244                                         continue;
4245                         }
4246
4247                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4248                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4249                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4250                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4251                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4252                         {
4253                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4254                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4255                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4256                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4257                         }
4258                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4259                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4260                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4261                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4262                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4263                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4264                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4265                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4266
4267                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4268                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4269                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4270                         DPSOFTRAST_Vector3Normalize(lightnormal);
4271
4272                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4273
4274                         if(thread->shader_exactspecularmath)
4275                         {
4276                                 // reflect lightnormal at surfacenormal, take the negative of that
4277                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4278                                 float f;
4279                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4280                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4281                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4282                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4283
4284                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4285                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4286                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4287                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4288                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4289
4290                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4291                         }
4292                         else
4293                         {
4294                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4295                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4296                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4297                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4298
4299                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4300                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4301                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4302                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4303
4304                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4305                         }
4306                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4307
4308                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4309                         {
4310                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4311                                 attenuation *= (1.0f / 255.0f);
4312                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4313                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4314                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4315                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4316                         }
4317                         else
4318                         {
4319                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4320                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4321                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4322                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4323                         }
4324                         buffer_FragColorbgra8[x*4+0] = d[0];
4325                         buffer_FragColorbgra8[x*4+1] = d[1];
4326                         buffer_FragColorbgra8[x*4+2] = d[2];
4327                         buffer_FragColorbgra8[x*4+3] = d[3];
4328                 }
4329         }
4330         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4331         {
4332                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4333                 for (x = startx;x < endx;x++)
4334                 {
4335                         z = buffer_z[x];
4336                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4337                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4338                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4339                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4340                         if (attenuation < 0.01f)
4341                                 continue;
4342                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4343                         {
4344                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4345                                 if (attenuation < 0.01f)
4346                                         continue;
4347                         }
4348
4349                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4350                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4351                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4352                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4353                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4354                         {
4355                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4356                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4357                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4358                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4359                         }
4360                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4361                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4362                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4363                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4364
4365                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4366                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4367                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4368                         DPSOFTRAST_Vector3Normalize(lightnormal);
4369
4370                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4371                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4372                         {
4373                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4374                                 attenuation *= (1.0f / 255.0f);
4375                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4376                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4377                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4378                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4379                         }
4380                         else
4381                         {
4382                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4383                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4384                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4385                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4386                         }
4387                         buffer_FragColorbgra8[x*4+0] = d[0];
4388                         buffer_FragColorbgra8[x*4+1] = d[1];
4389                         buffer_FragColorbgra8[x*4+2] = d[2];
4390                         buffer_FragColorbgra8[x*4+3] = d[3];
4391                 }
4392         }
4393         else
4394         {
4395                 for (x = startx;x < endx;x++)
4396                 {
4397                         z = buffer_z[x];
4398                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4399                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4400                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4401                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4402                         if (attenuation < 0.01f)
4403                                 continue;
4404                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4405                         {
4406                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4407                                 if (attenuation < 0.01f)
4408                                         continue;
4409                         }
4410
4411                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4412                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4413                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4414                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4415                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4416                         {
4417                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4418                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4419                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4420                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4421                         }
4422                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4423                         {
4424                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4425                                 attenuation *= (1.0f / 255.0f);
4426                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4427                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4428                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4429                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4430                         }
4431                         else
4432                         {
4433                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4434                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4435                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4436                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4437                         }
4438                         buffer_FragColorbgra8[x*4+0] = d[0];
4439                         buffer_FragColorbgra8[x*4+1] = d[1];
4440                         buffer_FragColorbgra8[x*4+2] = d[2];
4441                         buffer_FragColorbgra8[x*4+3] = d[3];
4442                 }
4443         }
4444         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4445 #endif
4446 }
4447
4448
4449
4450 static void DPSOFTRAST_VertexShader_Refraction(void)
4451 {
4452         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4453         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4454         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4455 }
4456
4457 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4458 {
4459         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4460         float z;
4461         int x, startx = span->startx, endx = span->endx;
4462
4463         // texture reads
4464         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4465         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4466
4467         // varyings
4468         float ModelViewProjectionPositiondata[4];
4469         float ModelViewProjectionPositionslope[4];
4470
4471         // uniforms
4472         float ScreenScaleRefractReflect[2];
4473         float ScreenCenterRefractReflect[2];
4474         float DistortScaleRefractReflect[2];
4475         float RefractColor[4];
4476
4477         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4478         if(!texture) return;
4479
4480         // read textures
4481         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4482         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4483
4484         // read varyings
4485         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4486
4487         // read uniforms
4488         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4489         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4490         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4491         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4492         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4493         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4494         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4495         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4496         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4497         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4498
4499         // do stuff
4500         for (x = startx;x < endx;x++)
4501         {
4502                 float SafeScreenTexCoord[2];
4503                 float ScreenTexCoord[2];
4504                 float v[3];
4505                 float iw;
4506                 unsigned char c[4];
4507
4508                 z = buffer_z[x];
4509
4510                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4511                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4512
4513                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4514                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4515                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4516
4517                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4518                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4519                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4520                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4521                 DPSOFTRAST_Vector3Normalize(v);
4522                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4523                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4524
4525                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4526                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4527
4528                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4529                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4530                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4531                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4532         }
4533
4534         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4535 }
4536
4537
4538
4539 static void DPSOFTRAST_VertexShader_Water(void)
4540 {
4541         int i;
4542         int numvertices = dpsoftrast.numvertices;
4543         float EyePosition[4];
4544         float EyeVectorModelSpace[4];
4545         float EyeVector[4];
4546         float position[4];
4547         float svector[4];
4548         float tvector[4];
4549         float normal[4];
4550         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4551         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4552         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4553         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4554         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4555         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4556         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4557         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4558         for (i = 0;i < numvertices;i++)
4559         {
4560                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4561                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4562                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4563                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4564                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4565                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4566                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4567                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4568                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4569                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4570                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4571                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4572                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4573                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4574                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4575                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4576                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4577                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4578                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4579                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4580                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4581                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4582         }
4583         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4584         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4585         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4586 }
4587
4588
4589 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4590 {
4591         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4592         float z;
4593         int x, startx = span->startx, endx = span->endx;
4594
4595         // texture reads
4596         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4597         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4598
4599         // varyings
4600         float ModelViewProjectionPositiondata[4];
4601         float ModelViewProjectionPositionslope[4];
4602         float EyeVectordata[4];
4603         float EyeVectorslope[4];
4604
4605         // uniforms
4606         float ScreenScaleRefractReflect[4];
4607         float ScreenCenterRefractReflect[4];
4608         float DistortScaleRefractReflect[4];
4609         float RefractColor[4];
4610         float ReflectColor[4];
4611         float ReflectFactor;
4612         float ReflectOffset;
4613
4614         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4615         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4616         if(!texture_refraction || !texture_reflection) return;
4617
4618         // read textures
4619         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4620         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4621
4622         // read varyings
4623         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4624         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4625
4626         // read uniforms
4627         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4628         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4629         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4630         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4631         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4632         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4633         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4634         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4635         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4636         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4637         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4638         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4639         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4640         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4641         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4642         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4643         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4644         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4645         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4646         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4647         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4648         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4649
4650         // do stuff
4651         for (x = startx;x < endx;x++)
4652         {
4653                 float SafeScreenTexCoord[4];
4654                 float ScreenTexCoord[4];
4655                 float v[3];
4656                 float iw;
4657                 unsigned char c1[4];
4658                 unsigned char c2[4];
4659                 float Fresnel;
4660
4661                 z = buffer_z[x];
4662
4663                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4664                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4665
4666                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4667                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4668                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4669                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4670                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4671
4672                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4673                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4674                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4675                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4676                 DPSOFTRAST_Vector3Normalize(v);
4677                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4678                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4679                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4680                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4681
4682                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4683                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4684                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4685                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4686                 DPSOFTRAST_Vector3Normalize(v);
4687                 Fresnel = 1.0f - v[2];
4688                 Fresnel = min(1.0f, Fresnel);
4689                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4690
4691                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4692                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4693                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4694                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4695
4696                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4697                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4698                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4699                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4700         }
4701
4702         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4703 }
4704
4705
4706
4707 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4708 {
4709         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4710 }
4711
4712 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4713 {
4714         // TODO: IMPLEMENT
4715         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4716         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4717         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4718         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4719         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4720 }
4721
4722
4723
4724 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4725 {
4726         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4727 }
4728
4729 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4730 {
4731         // TODO: IMPLEMENT
4732         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4733         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4734         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4735         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4736         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4737 }
4738
4739
4740
4741 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4742 {
4743         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4744 }
4745
4746 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4747 {
4748         // TODO: IMPLEMENT
4749         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4750         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4751         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4752         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4753         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4754 }
4755
4756
4757
4758 typedef struct DPSOFTRAST_ShaderModeInfo_s
4759 {
4760         int lodarrayindex;
4761         void (*Vertex)(void);
4762         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4763         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4764         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4765 }
4766 DPSOFTRAST_ShaderModeInfo;
4767
4768 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4769 {
4770         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4771         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4772         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4773         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4774         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4775         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4776         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4777         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4778         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4779         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4780         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4781         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4782         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4783         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4784         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4785         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4786         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4787         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4788 };
4789
4790 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4791 {
4792         int x;
4793         int startx;
4794         int endx;
4795         unsigned int *depthpixel;
4796         int depth;
4797         int depthslope;
4798         unsigned int d;
4799         unsigned char *pixelmask;
4800         DPSOFTRAST_State_Triangle *triangle;
4801         triangle = &thread->triangles[span->triangle];
4802         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4803         startx = span->startx;
4804         endx = span->endx;
4805         depth = span->depthbase;
4806         depthslope = span->depthslope;
4807         pixelmask = thread->pixelmaskarray;
4808         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4809         {
4810                 switch(thread->fb_depthfunc)
4811                 {
4812                 default:
4813                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4814                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4815                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4816                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4817                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4818                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4819                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4820                 }
4821                 while (startx < endx && !pixelmask[startx])
4822                         startx++;
4823                 while (endx > startx && !pixelmask[endx-1])
4824                         endx--;
4825         }
4826         else
4827         {
4828                 // no depth testing means we're just dealing with color...
4829                 memset(pixelmask + startx, 1, endx - startx);
4830         }
4831         span->pixelmask = pixelmask;
4832         span->startx = startx;
4833         span->endx = endx;
4834 }
4835
4836 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4837 {
4838         int x, d, depth, depthslope, startx, endx;
4839         const unsigned char *pixelmask;
4840         unsigned int *depthpixel;
4841         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4842         {
4843                 depth = span->depthbase;
4844                 depthslope = span->depthslope;
4845                 pixelmask = span->pixelmask;
4846                 startx = span->startx;
4847                 endx = span->endx;
4848                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4849                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4850                         if (pixelmask[x])
4851                                 depthpixel[x] = d;
4852         }
4853 }
4854
4855 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4856 {
4857         int i;
4858         DPSOFTRAST_State_Triangle *triangle;
4859         DPSOFTRAST_State_Span *span;
4860         for (i = 0; i < thread->numspans; i++)
4861         {
4862                 span = &thread->spans[i];
4863                 triangle = &thread->triangles[span->triangle];
4864                 DPSOFTRAST_Draw_DepthTest(thread, span);
4865                 if (span->startx >= span->endx)
4866                         continue;
4867                 // run pixel shader if appropriate
4868                 // do this before running depthmask code, to allow the pixelshader
4869                 // to clear pixelmask values for alpha testing
4870                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4871                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4872                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4873         }
4874         thread->numspans = 0;
4875 }
4876
4877 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4878
4879 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4880 {
4881 #ifdef SSE_POSSIBLE
4882         int cullface = thread->cullface;
4883         int minx, maxx, miny, maxy;
4884         int miny1, maxy1, miny2, maxy2;
4885         __m128i fbmin, fbmax;
4886         __m128 viewportcenter, viewportscale;
4887         int firstvertex = command->firstvertex;
4888         int numvertices = command->numvertices;
4889         int numtriangles = command->numtriangles;
4890         const int *element3i = command->element3i;
4891         const unsigned short *element3s = command->element3s;
4892         int clipped = command->clipped;
4893         int i;
4894         int j;
4895         int k;
4896         int y;
4897         int e[3];
4898         __m128i screeny;
4899         int starty, endy, bandy;
4900         int numpoints;
4901         int clipcase;
4902         float clipdist[4];
4903         float clip0origin, clip0slope;
4904         int clip0dir;
4905         __m128 triangleedge1, triangleedge2, trianglenormal;
4906         __m128 clipfrac[3];
4907         __m128 screen[4];
4908         DPSOFTRAST_State_Triangle *triangle;
4909         DPSOFTRAST_Texture *texture;
4910         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4911         miny = thread->fb_scissor[1];
4912         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4913         miny1 = bound(miny, thread->miny1, maxy);
4914         maxy1 = bound(miny, thread->maxy1, maxy);
4915         miny2 = bound(miny, thread->miny2, maxy);
4916         maxy2 = bound(miny, thread->maxy2, maxy);
4917         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4918         {
4919                 if (!ATOMIC_DECREMENT(command->refcount))
4920                 {
4921                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4922                                 MM_FREE(command->arrays);
4923                 }
4924                 return;
4925         }
4926         minx = thread->fb_scissor[0];
4927         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4928         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4929         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4930         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4931         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4932         screen[3] = _mm_setzero_ps();
4933         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4934         for (i = 0;i < numtriangles;i++)
4935         {
4936                 const float *screencoord4f = command->arrays;
4937                 const float *arrays = screencoord4f + numvertices*4;
4938
4939                 // generate the 3 edges of this triangle
4940                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4941                 if (element3s)
4942                 {
4943                         e[0] = element3s[i*3+0] - firstvertex;
4944                         e[1] = element3s[i*3+1] - firstvertex;
4945                         e[2] = element3s[i*3+2] - firstvertex;
4946                 }
4947                 else if (element3i)
4948                 {
4949                         e[0] = element3i[i*3+0] - firstvertex;
4950                         e[1] = element3i[i*3+1] - firstvertex;
4951                         e[2] = element3i[i*3+2] - firstvertex;
4952                 }
4953                 else
4954                 {
4955                         e[0] = i*3+0;
4956                         e[1] = i*3+1;
4957                         e[2] = i*3+2;
4958                 }
4959
4960 #define SKIPBACKFACE \
4961                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4962                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4963                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4964                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4965                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4966                 switch(cullface) \
4967                 { \
4968                 case GL_BACK: \
4969                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4970                                 continue; \
4971                         break; \
4972                 case GL_FRONT: \
4973                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4974                                 continue; \
4975                         break; \
4976                 }
4977
4978 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4979                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4980                         { \
4981                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4982                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4983                         }
4984 #define CLIPPEDVERTEXCOPY(k,p1) \
4985                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4986
4987 #define GENATTRIBCOPY(attrib, p1) \
4988                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4989 #define GENATTRIBLERP(attrib, p1, p2) \
4990                 { \
4991                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4992                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4993                 }
4994 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4995                 switch(clipcase) \
4996                 { \
4997                 default: \
4998                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4999                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5000                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5001                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
5002                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
5003                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5004                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5005                 }
5006
5007                 if (! clipped)
5008                         goto notclipped;
5009
5010                 // calculate distance from nearplane
5011                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5012                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5013                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5014                 if (clipdist[0] >= 0.0f)
5015                 {
5016                         if (clipdist[1] >= 0.0f)
5017                         {
5018                                 if (clipdist[2] >= 0.0f)
5019                                 {
5020                                 notclipped:
5021                                         // triangle is entirely in front of nearplane
5022                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5023                                         SKIPBACKFACE;
5024                                         numpoints = 3;
5025                                         clipcase = 0;
5026                                 }
5027                                 else
5028                                 {
5029                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5030                                         SKIPBACKFACE;
5031                                         numpoints = 4;
5032                                         clipcase = 1;
5033                                 }
5034                         }
5035                         else
5036                         {
5037                                 if (clipdist[2] >= 0.0f)
5038                                 {
5039                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5040                                         SKIPBACKFACE;
5041                                         numpoints = 4;
5042                                         clipcase = 2;
5043                                 }
5044                                 else
5045                                 {
5046                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5047                                         SKIPBACKFACE;
5048                                         numpoints = 3;
5049                                         clipcase = 3;
5050                                 }
5051                         }
5052                 }
5053                 else if (clipdist[1] >= 0.0f)
5054                 {
5055                         if (clipdist[2] >= 0.0f)
5056                         {
5057                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5058                                 SKIPBACKFACE;
5059                                 numpoints = 4;
5060                                 clipcase = 4;
5061                         }
5062                         else
5063                         {
5064                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5065                                 SKIPBACKFACE;
5066                                 numpoints = 3;
5067                                 clipcase = 5;
5068                         }
5069                 }
5070                 else if (clipdist[2] >= 0.0f)
5071                 {
5072                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5073                         SKIPBACKFACE;
5074                         numpoints = 3;
5075                         clipcase = 6;
5076                 }
5077                 else continue; // triangle is entirely behind nearplane
5078
5079                 {
5080                         // calculate integer y coords for triangle points
5081                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5082                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5083                                         screenmin = _mm_min_epi16(screeni, screenir),
5084                                         screenmax = _mm_max_epi16(screeni, screenir);
5085                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5086                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5087                         screenmin = _mm_max_epi16(screenmin, fbmin);
5088                         screenmax = _mm_min_epi16(screenmax, fbmax);
5089                         // skip offscreen triangles
5090                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5091                                 continue;
5092                         starty = _mm_extract_epi16(screenmin, 1);
5093                         endy = _mm_extract_epi16(screenmax, 1)+1;
5094                         if (starty >= maxy1 && endy <= miny2)
5095                                 continue;
5096                         screeny = _mm_srai_epi32(screeni, 16);
5097                 }
5098
5099                 triangle = &thread->triangles[thread->numtriangles];
5100
5101                 // calculate attribute plans for triangle data...
5102                 // okay, this triangle is going to produce spans, we'd better project
5103                 // the interpolants now (this is what gives perspective texturing),
5104                 // this consists of simply multiplying all arrays by the W coord
5105                 // (which is basically 1/Z), which will be undone per-pixel
5106                 // (multiplying by Z again) to get the perspective-correct array
5107                 // values
5108                 {
5109                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5110                         __m128 mipedgescale, mipdensity;
5111                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5112                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5113                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5114                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5115                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5116                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5117                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5118                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5119                         attribedge1 = _mm_sub_ss(w0, w1);
5120                         attribedge2 = _mm_sub_ss(w2, w1);
5121                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5122                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5123                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5124                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5125                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5126                         _mm_store_ss(&triangle->w[0], attribxslope);
5127                         _mm_store_ss(&triangle->w[1], attribyslope);
5128                         _mm_store_ss(&triangle->w[2], attriborigin);
5129                         
5130                         clip0origin = 0;
5131                         clip0slope = 0;
5132                         clip0dir = 0;
5133                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5134                         {
5135                                 float cliporigin, clipxslope, clipyslope;
5136                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5137                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5138                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5139                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5140                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5141                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5142                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5143                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5144                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5145                                 if(clipxslope != 0)
5146                                 {
5147                                         clip0origin = -cliporigin/clipxslope;
5148                                         clip0slope = -clipyslope/clipxslope;
5149                                         clip0dir = clipxslope > 0 ? 1 : -1;
5150                                 }
5151                                 else if(clipyslope > 0)
5152                                 {
5153                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5154                                         clip0slope = dpsoftrast.fb_width;
5155                                         clip0dir = -1;
5156                                 }
5157                                 else if(clipyslope < 0)
5158                                 {
5159                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5160                                         clip0slope = -dpsoftrast.fb_width;
5161                                         clip0dir = -1;
5162                                 }
5163                                 else if(clip0origin < 0) continue;
5164                         }
5165
5166                         mipedgescale = _mm_setzero_ps();
5167                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5168                         {
5169                                 __m128 attrib0, attrib1, attrib2;
5170                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5171                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5172                                         break;
5173                                 arrays += numvertices*4;
5174                                 GENATTRIBS(attrib0, attrib1, attrib2);
5175                                 attriborigin = _mm_mul_ps(attrib1, w1);
5176                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5177                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5178                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5179                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5180                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5181                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5182                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5183                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5184                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5185                                 {
5186                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5187                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5188                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5189                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5190                                 }
5191                         }
5192
5193                         memset(triangle->mip, 0, sizeof(triangle->mip));
5194                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5195                         {
5196                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5197                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5198                                         break;
5199                                 texture = thread->texbound[texunit];
5200                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5201                                 {
5202                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5203                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5204                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5205                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5206                                         // this will be multiplied in the texturing routine by the texture resolution
5207                                         y = _mm_cvtss_si32(mipdensity);
5208                                         if (y > 0)
5209                                         {
5210                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5211                                                 if (y > texture->mipmaps - 1)
5212                                                         y = texture->mipmaps - 1;
5213                                                 triangle->mip[texunit] = y;
5214                                         }
5215                                 }
5216                         }
5217                 }
5218         
5219                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5220                 for (; y < bandy;)
5221                 {
5222                         __m128 xcoords, xslope;
5223                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5224                         int yccmask = _mm_movemask_epi8(ycc);
5225                         int edge0p, edge0n, edge1p, edge1n;
5226                         int nexty;
5227                         float w, wslope;
5228                         float clip0;
5229                         if (numpoints == 4)
5230                         {
5231                                 switch(yccmask)
5232                                 {
5233                                 default:
5234                                 case 0xFFFF: /*0000*/ y = endy; continue;
5235                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5236                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5237                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5238                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5239                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5240                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5241                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5242                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5243                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5244                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5245                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5246                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5247                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5248                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5249                                 case 0x0000: /*1111*/ y++; continue;
5250                                 }
5251                         }
5252                         else
5253                         {
5254                                 switch(yccmask)
5255                                 {
5256                                 default:
5257                                 case 0xFFFF: /*000*/ y = endy; continue;
5258                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5259                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5260                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5261                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5262                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5263                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5264                                 case 0x0000: /*111*/ y++; continue;
5265                                 }
5266                         }
5267                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5268                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5269                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5270                         nexty = _mm_extract_epi16(ycc, 0);
5271                         if (nexty >= bandy) nexty = bandy-1;
5272                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5273                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5274                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5275                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5276                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5277                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5278                         {
5279                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5280                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5281                         }
5282                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5283                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5284                         {
5285                                 int startx, endx, offset;
5286                                 startx = _mm_cvtss_si32(xcoords);
5287                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5288                                 if (startx < minx) startx = minx;
5289                                 if (endx > maxx) endx = maxx;
5290                                 if (startx >= endx) continue;
5291
5292                                 if (clip0dir)
5293                                 {
5294                                         if (clip0dir > 0)
5295                                         {
5296                                                 if (startx < clip0) 
5297                                                 {
5298                                                         if(endx <= clip0) continue;
5299                                                         startx = (int)clip0;
5300                                                 }
5301                                         }
5302                                         else if (endx > clip0) 
5303                                         {
5304                                                 if(startx >= clip0) continue;
5305                                                 endx = (int)clip0;
5306                                         }
5307                                 }
5308                                                 
5309                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5310                                 {
5311                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5312                                         span->triangle = thread->numtriangles;
5313                                         span->x = offset;
5314                                         span->y = y;
5315                                         span->startx = 0;
5316                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5317                                         if (span->startx >= span->endx)
5318                                                 continue;
5319                                         wslope = triangle->w[0];
5320                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5321                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5322                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5323                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5324                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5325                                 }
5326                         }
5327                 }
5328
5329                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5330                 {
5331                         DPSOFTRAST_Draw_ProcessSpans(thread);
5332                         thread->numtriangles = 0;
5333                 }
5334         }
5335
5336         if (!ATOMIC_DECREMENT(command->refcount))
5337         {
5338                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5339                         MM_FREE(command->arrays);
5340         }
5341
5342         if (thread->numspans > 0 || thread->numtriangles > 0)
5343         {
5344                 DPSOFTRAST_Draw_ProcessSpans(thread);
5345                 thread->numtriangles = 0;
5346         }
5347 #endif
5348 }
5349
5350 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5351 {
5352         int i;
5353         int j;
5354         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5355         int datasize = 2*numvertices*sizeof(float[4]);
5356         DPSOFTRAST_Command_Draw *command;
5357         unsigned char *data;
5358         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5359         {
5360                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5361                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5362                         break;
5363                 datasize += numvertices*sizeof(float[4]);
5364         }
5365         if (element3s)
5366                 datasize += numtriangles*sizeof(unsigned short[3]);
5367         else if (element3i)
5368                 datasize += numtriangles*sizeof(int[3]);
5369         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5370         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5371         {
5372                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5373                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5374         }
5375         else
5376         {
5377                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5378                 data = (unsigned char *)command + commandsize;
5379         }
5380         command->firstvertex = firstvertex;
5381         command->numvertices = numvertices;
5382         command->numtriangles = numtriangles;
5383         command->arrays = (float *)data;
5384         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5385         dpsoftrast.firstvertex = firstvertex;
5386         dpsoftrast.numvertices = numvertices;
5387         dpsoftrast.screencoord4f = (float *)data;
5388         data += numvertices*sizeof(float[4]);
5389         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5390         data += numvertices*sizeof(float[4]);
5391         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5392         {
5393                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5394                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5395                         break;
5396                 dpsoftrast.post_array4f[j] = (float *)data;
5397                 data += numvertices*sizeof(float[4]);
5398         }
5399         command->element3i = NULL;
5400         command->element3s = NULL;
5401         if (element3s)
5402         {
5403                 command->element3s = (unsigned short *)data;
5404                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5405         }
5406         else if (element3i)
5407         {
5408                 command->element3i = (int *)data;
5409                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5410         }
5411         return command;
5412 }
5413
5414 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5415 {
5416         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5417         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5418         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5419         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5420         if (command->starty >= command->endy)
5421         {
5422                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5423                         MM_FREE(command->arrays);
5424                 DPSOFTRAST_UndoCommand(command->commandsize);
5425                 return;
5426         }
5427         command->clipped = dpsoftrast.drawclipped;
5428         command->refcount = dpsoftrast.numthreads;
5429
5430         if (dpsoftrast.usethreads)
5431         {
5432                 int i;
5433                 DPSOFTRAST_Draw_SyncCommands();
5434                 for (i = 0; i < dpsoftrast.numthreads; i++)
5435                 {
5436                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5437                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5438                                 Thread_CondSignal(thread->drawcond);
5439                 }
5440         }
5441         else
5442         {
5443                 DPSOFTRAST_Draw_FlushThreads();
5444         }
5445 }
5446
5447 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5448 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5449 {
5450         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5451 }
5452 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5453 {
5454         DPSOFTRAST_Command_SetRenderTargets *command;
5455         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5456                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5457                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5458                 DPSOFTRAST_Flush();
5459         dpsoftrast.fb_width = width;
5460         dpsoftrast.fb_height = height;
5461         dpsoftrast.fb_depthpixels = depthpixels;
5462         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5463         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5464         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5465         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5466         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5467         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5468         command->width = width;
5469         command->height = height;
5470 }
5471  
5472 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5473 {
5474         int commandoffset = thread->commandoffset;
5475         while (commandoffset != endoffset)
5476         {
5477                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5478                 switch (command->opcode)
5479                 {
5480 #define INTERPCOMMAND(name) \
5481                 case DPSOFTRAST_OPCODE_##name : \
5482                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5483                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5484                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5485                                 commandoffset = 0; \
5486                         break;
5487                 INTERPCOMMAND(Viewport)
5488                 INTERPCOMMAND(ClearColor)
5489                 INTERPCOMMAND(ClearDepth)
5490                 INTERPCOMMAND(ColorMask)
5491                 INTERPCOMMAND(DepthTest)
5492                 INTERPCOMMAND(ScissorTest)
5493                 INTERPCOMMAND(Scissor)
5494                 INTERPCOMMAND(BlendFunc)
5495                 INTERPCOMMAND(BlendSubtract)
5496                 INTERPCOMMAND(DepthMask)
5497                 INTERPCOMMAND(DepthFunc)
5498                 INTERPCOMMAND(DepthRange)
5499                 INTERPCOMMAND(PolygonOffset)
5500                 INTERPCOMMAND(CullFace)
5501                 INTERPCOMMAND(SetTexture)
5502                 INTERPCOMMAND(SetShader)
5503                 INTERPCOMMAND(Uniform4f)
5504                 INTERPCOMMAND(UniformMatrix4f)
5505                 INTERPCOMMAND(Uniform1i)
5506                 INTERPCOMMAND(SetRenderTargets)
5507                 INTERPCOMMAND(ClipPlane)
5508
5509                 case DPSOFTRAST_OPCODE_Draw:
5510                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5511                         commandoffset += command->commandsize;
5512                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5513                                 commandoffset = 0;
5514                         thread->commandoffset = commandoffset;
5515                         break;
5516
5517                 case DPSOFTRAST_OPCODE_Reset:
5518                         commandoffset = 0;
5519                         break;
5520                 }
5521         }
5522         thread->commandoffset = commandoffset;
5523 }
5524
5525 static int DPSOFTRAST_Draw_Thread(void *data)
5526 {
5527         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5528         while(thread->index >= 0)
5529         {
5530                 if (thread->commandoffset != dpsoftrast.drawcommand)
5531                 {
5532                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5533                 }
5534                 else 
5535                 {
5536                         Thread_LockMutex(thread->drawmutex);
5537                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5538                         {
5539                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5540                                 thread->starving = true;
5541                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5542                                 thread->starving = false;
5543                         }
5544                         Thread_UnlockMutex(thread->drawmutex);
5545                 }
5546         }   
5547         return 0;
5548 }
5549
5550 static void DPSOFTRAST_Draw_FlushThreads(void)
5551 {
5552         DPSOFTRAST_State_Thread *thread;
5553         int i;
5554         DPSOFTRAST_Draw_SyncCommands();
5555         if (dpsoftrast.usethreads) 
5556         {
5557                 for (i = 0; i < dpsoftrast.numthreads; i++)
5558                 {
5559                         thread = &dpsoftrast.threads[i];
5560                         if (thread->commandoffset != dpsoftrast.drawcommand)
5561                         {
5562                                 Thread_LockMutex(thread->drawmutex);
5563                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5564                                         Thread_CondSignal(thread->drawcond);
5565                                 Thread_UnlockMutex(thread->drawmutex);
5566                         }
5567                 }
5568                 for (i = 0; i < dpsoftrast.numthreads; i++)
5569                 {
5570                         thread = &dpsoftrast.threads[i];
5571                         if (thread->commandoffset != dpsoftrast.drawcommand)
5572                         {
5573                                 Thread_LockMutex(thread->drawmutex);
5574                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5575                                 {
5576                                         thread->waiting = true;
5577                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5578                                         thread->waiting = false;
5579                                 }
5580                                 Thread_UnlockMutex(thread->drawmutex);
5581                         }
5582                 }
5583         }
5584         else
5585         {
5586                 for (i = 0; i < dpsoftrast.numthreads; i++)
5587                 {
5588                         thread = &dpsoftrast.threads[i];
5589                         if (thread->commandoffset != dpsoftrast.drawcommand)
5590                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5591                 }
5592         }
5593         dpsoftrast.commandpool.usedcommands = 0;
5594 }
5595
// Public flush entry point: executes all queued commands via
// DPSOFTRAST_Draw_FlushThreads and returns once that call completes.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5600
// Public finish entry point; does the same work as DPSOFTRAST_Flush, i.e.
// executes all queued commands before returning.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5605
5606 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5607 {
5608         int i;
5609         union
5610         {
5611                 int i;
5612                 unsigned char b[4];
5613         }
5614         u;
5615         u.i = 1;
5616         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5617         dpsoftrast.bigendian = u.b[3];
5618         dpsoftrast.fb_width = width;
5619         dpsoftrast.fb_height = height;
5620         dpsoftrast.fb_depthpixels = depthpixels;
5621         dpsoftrast.fb_colorpixels[0] = colorpixels;
5622         dpsoftrast.fb_colorpixels[1] = NULL;
5623         dpsoftrast.fb_colorpixels[1] = NULL;
5624         dpsoftrast.fb_colorpixels[1] = NULL;
5625         dpsoftrast.viewport[0] = 0;
5626         dpsoftrast.viewport[1] = 0;
5627         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5628         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5629         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5630         dpsoftrast.texture_firstfree = 1;
5631         dpsoftrast.texture_end = 1;
5632         dpsoftrast.texture_max = 0;
5633         dpsoftrast.color[0] = 1;
5634         dpsoftrast.color[1] = 1;
5635         dpsoftrast.color[2] = 1;
5636         dpsoftrast.color[3] = 1;
5637         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5638         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5639         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5640         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5641         for (i = 0; i < dpsoftrast.numthreads; i++)
5642         {
5643                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5644                 thread->index = i;
5645                 thread->cullface = GL_BACK;
5646         thread->colormask[0] = 1; 
5647                 thread->colormask[1] = 1;
5648                 thread->colormask[2] = 1;
5649                 thread->colormask[3] = 1;
5650                 thread->blendfunc[0] = GL_ONE;
5651                 thread->blendfunc[1] = GL_ZERO;
5652                 thread->depthmask = true;
5653                 thread->depthtest = true;
5654                 thread->depthfunc = GL_LEQUAL;
5655                 thread->scissortest = false;
5656                 thread->viewport[0] = 0;
5657                 thread->viewport[1] = 0;
5658                 thread->viewport[2] = dpsoftrast.fb_width;
5659                 thread->viewport[3] = dpsoftrast.fb_height;
5660                 thread->scissor[0] = 0;
5661                 thread->scissor[1] = 0;
5662                 thread->scissor[2] = dpsoftrast.fb_width;
5663                 thread->scissor[3] = dpsoftrast.fb_height;
5664                 thread->depthrange[0] = 0;
5665                 thread->depthrange[1] = 1;
5666                 thread->polygonoffset[0] = 0;
5667                 thread->polygonoffset[1] = 0;
5668                 thread->clipplane[0] = 0;
5669                 thread->clipplane[1] = 0;
5670                 thread->clipplane[2] = 0;
5671                 thread->clipplane[3] = 1;
5672         
5673                 thread->numspans = 0;
5674                 thread->numtriangles = 0;
5675                 thread->commandoffset = 0;
5676                 thread->waiting = false;
5677                 thread->starving = false;
5678            
5679                 thread->validate = -1;
5680                 DPSOFTRAST_Validate(thread, -1);
5681  
5682                 if (dpsoftrast.usethreads)
5683                 {
5684                         thread->waitcond = Thread_CreateCond();
5685                         thread->drawcond = Thread_CreateCond();
5686                         thread->drawmutex = Thread_CreateMutex();
5687                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5688                 }
5689         }
5690         return 0;
5691 }
5692
5693 void DPSOFTRAST_Shutdown(void)
5694 {
5695         int i;
5696         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5697         {
5698                 DPSOFTRAST_State_Thread *thread;
5699                 for (i = 0; i < dpsoftrast.numthreads; i++)
5700                 {
5701                         thread = &dpsoftrast.threads[i];
5702                         Thread_LockMutex(thread->drawmutex);
5703                         thread->index = -1;
5704                         Thread_CondSignal(thread->drawcond);
5705                         Thread_UnlockMutex(thread->drawmutex);
5706                         Thread_WaitThread(thread->thread, 0);
5707                         Thread_DestroyCond(thread->waitcond);
5708                         Thread_DestroyCond(thread->drawcond);
5709                         Thread_DestroyMutex(thread->drawmutex);
5710                 }
5711         }
5712         for (i = 0;i < dpsoftrast.texture_end;i++)
5713                 if (dpsoftrast.texture[i].bytes)
5714                         MM_FREE(dpsoftrast.texture[i].bytes);
5715         if (dpsoftrast.texture)
5716                 free(dpsoftrast.texture);
5717         if (dpsoftrast.threads)
5718                 MM_FREE(dpsoftrast.threads);
5719         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5720 }
5721