git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
server: ignore out of order prespawn/spawn/begin commands
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// ALIGN_SIZE is the alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC;
// ATOMIC_SIZE is not referenced in this part of the file
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

#ifdef SSE_POSSIBLE
	// alignment decoration and the store fence: all GCC-style toolchains
	// handled here (Apple, MinGW, plain GCC) share one spelling, MSVC has its own
	#if defined(__APPLE__) || defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
	#endif

	// atomic counter type and operations, chosen per platform
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ATOMIC_COUNTER volatile LONG
		// this LONG * cast serves to fix an issue with broken mingw
		// packages on Ubuntu; these only declare the function to take
		// a LONG *, causing a compile error here. This seems to be
		// error- and warn-free on platforms that DO declare
		// InterlockedIncrement correctly, like mingw on Windows.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// plain, non-atomic fallbacks for anything the platform section left undefined
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// The version test selects GCC releases older than 4.6 (presumably because
// those lack _mm_cvtss_f32) and emulates the intrinsic with the builtin.
// It has to test __GNUC__ (not the undefined __GNUC, which evaluates to 0 and
// made the test always true) and must only look at the minor version when the
// major version is 4, otherwise e.g. GCC 5.1 would be treated as "old".
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

// allocates size bytes aligned to ALIGN_SIZE; release with MM_FREE
#define MM_MALLOC(size) _mm_malloc((size), ALIGN_SIZE)

// Aligned counterpart of calloc: returns nmemb*size zeroed bytes aligned to
// ALIGN_SIZE, or NULL if the allocation fails or nmemb*size does not fit in a
// size_t.  Release with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE the C library allocator is used directly
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// slot numbers for the per-array tables in this file (triangle attribs,
// post_array4f); DPSOFTRAST_ARRAY_TOTAL is the slot count and must stay last
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR,
	// eight texture coordinate arrays
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL
} DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// commands reuse the general alignment settings
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header fields that every command structure starts with
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
} DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned struct type
// DPSOFTRAST_Command_<name> made of the common header followed by fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar) evaluate one
// attribute array of a triangle at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// triangle and span are pointers; data and slope are outputs (__m128 lvalues
// for the SSE form, float[4] for the scalar form).
// Every parameter is parenthesized where it is used as an expression so that
// arguments such as &spans[i] expand correctly.  Arguments are evaluated more
// than once, so they must not have side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced from a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
	// depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthbase;
	// depthbuffer value pixel delta
	int depthslope;
} DPSOFTRAST_State_Span);
213
// capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived values are recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// every bit that matters for drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// blend modes that DPSOFTRAST_RecalcBlendFunc derives from the blendfunc
// factor pair; DPSOFTRAST_BLENDMODE_TOTAL is the count and must stay last
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL
} DPSOFTRAST_BLENDMODE;
238
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index;
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2];
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]);
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4];
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace;
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// scale (2^30) applied when converting a float depth to an integer depth value
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels into one int laid out as A<<24 | R<<16 | G<<8 | B,
// rounding each channel*255 to nearest; no clamping is done, so callers should
// pass values in 0..1.  Parameters are parenthesized so that compound
// arguments such as (x + y) are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to an integer: 0.0 maps to DPSOFTRAST_DEPTHSCALE, 1.0 maps to 0
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; always evaluates to 0.  thread is
// parenthesized so that pointer expressions such as &threads[i] expand
// correctly; both arguments are evaluated more than once, so they must not
// have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         for (;;)
621         {
622                 s = w * h * d * sides * 4;
623                 texture->mipmap[mipmaps][0] = size;
624                 texture->mipmap[mipmaps][1] = s;
625                 texture->mipmap[mipmaps][2] = w;
626                 texture->mipmap[mipmaps][3] = h;
627                 texture->mipmap[mipmaps][4] = d;
628                 size += s;
629                 mipmaps++;
630                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
631                         break;
632                 if (w > 1) w >>= 1;
633                 if (h > 1) h >>= 1;
634                 if (d > 1) d >>= 1;
635         }
636         texture->mipmaps = mipmaps;
637         texture->size = size;
638
639         // allocate the pixels now
640         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
641
642         return texnum;
643 }
644 void DPSOFTRAST_Texture_Free(int index)
645 {
646         DPSOFTRAST_Texture *texture;
647         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648         if (texture->binds)
649                 DPSOFTRAST_Flush();
650         if (texture->bytes)
651                 MM_FREE(texture->bytes);
652         texture->bytes = NULL;
653         memset(texture, 0, sizeof(*texture));
654         // adjust the free range and used range
655         if (dpsoftrast.texture_firstfree > index)
656                 dpsoftrast.texture_firstfree = index;
657         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658                 dpsoftrast.texture_end--;
659 }
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
661 {
662         int i, x, y, z, w, layer0, layer1, row0, row1;
663         unsigned char *o, *i0, *i1, *i2, *i3;
664         DPSOFTRAST_Texture *texture;
665         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
666         if (texture->mipmaps <= 1)
667                 return;
668         for (i = 1;i < texture->mipmaps;i++)
669         {
670                 for (z = 0;z < texture->mipmap[i][4];z++)
671                 {
672                         layer0 = z*2;
673                         layer1 = z*2+1;
674                         if (layer1 >= texture->mipmap[i-1][4])
675                                 layer1 = texture->mipmap[i-1][4]-1;
676                         for (y = 0;y < texture->mipmap[i][3];y++)
677                         {
678                                 row0 = y*2;
679                                 row1 = y*2+1;
680                                 if (row1 >= texture->mipmap[i-1][3])
681                                         row1 = texture->mipmap[i-1][3]-1;
682                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
683                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687                                 w = texture->mipmap[i][2];
688                                 if (layer1 > layer0)
689                                 {
690                                         if (texture->mipmap[i-1][2] > 1)
691                                         {
692                                                 // average 3D texture
693                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
694                                                 {
695                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
699                                                 }
700                                         }
701                                         else
702                                         {
703                                                 // average 3D mipmap with parent width == 1
704                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
705                                                 {
706                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
710                                                 }
711                                         }
712                                 }
713                                 else
714                                 {
715                                         if (texture->mipmap[i-1][2] > 1)
716                                         {
717                                                 // average 2D texture (common case)
718                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
719                                                 {
720                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
724                                                 }
725                                         }
726                                         else
727                                         {
728                                                 // 2D texture with parent width == 1
729                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
730                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
731                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
732                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
733                                         }
734                                 }
735                         }
736                 }
737         }
738 }
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
740 {
741         DPSOFTRAST_Texture *texture;
742         unsigned char *dst;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         if (pixels)
747         {
748                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749                 while (blockheight > 0)
750                 {
751                         dst -= texture->mipmap[0][2] * 4;
752                         memcpy(dst, pixels, blockwidth * 4);
753                         pixels += blockwidth * 4;
754                         blockheight--;
755                 }
756         }
757         DPSOFTRAST_Texture_CalculateMipmaps(index);
758 }
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
760 {
761         DPSOFTRAST_Texture *texture;
762         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
763         if (texture->binds)
764                 DPSOFTRAST_Flush();
765         if (pixels)
766         {
767                 int i, stride = texture->mipmap[0][2]*4;
768                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
769                 for (i = texture->mipmap[0][3];i > 0;i--)
770                 {
771                         dst -= stride;
772                         memcpy(dst, pixels, stride);
773                         pixels += stride;
774                 }
775         }
776         DPSOFTRAST_Texture_CalculateMipmaps(index);
777 }
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][2];
783 }
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][3];
789 }
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         return texture->mipmap[mip][4];
795 }
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
797 {
798         DPSOFTRAST_Texture *texture;
799         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
800         if (texture->binds)
801                 DPSOFTRAST_Flush();
802         return texture->bytes + texture->mipmap[mip][0];
803 }
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
805 {
806         DPSOFTRAST_Texture *texture;
807         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
809         {
810                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
811                 return;
812         }
813         if (texture->binds)
814                 DPSOFTRAST_Flush();
815         texture->filter = filter;
816 }
817
818 static void DPSOFTRAST_Draw_FlushThreads(void);
819
820 static void DPSOFTRAST_Draw_SyncCommands(void)
821 {
822         if(dpsoftrast.usethreads) MEMORY_BARRIER;
823         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
824 }
825
// Waits until at least `space` bytes of the command pool are unused.
// Returns immediately if the recorded usedcommands already leaves that much
// room; otherwise publishes the current write position and repeatedly waits
// on the thread that is furthest behind until enough room exists, then
// stores the recomputed usedcommands back into the pool.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
        DPSOFTRAST_State_Thread *thread;
        int i;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        // fast path: the cached count already leaves enough room
        if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                return;
        // make everything written so far visible as drawcommand
        DPSOFTRAST_Draw_SyncCommands();
        for(;;)
        {
                int waitindex = -1;
                int commandoffset;
                usedcommands = 0;
                // find the thread whose read position is furthest behind the
                // write position (distance measured modulo the pool size)
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        thread = &dpsoftrast.threads[i]; 
                        commandoffset = freecommand - thread->commandoffset;
                        if (commandoffset < 0)
                                commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
                        if (commandoffset > usedcommands)
                        {
                                waitindex = i;
                                usedcommands = commandoffset;
                        }
                }
                // done once enough room exists or no thread is behind at all
                if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                        break;
                thread = &dpsoftrast.threads[waitindex];
                Thread_LockMutex(thread->drawmutex);
                // re-check under the mutex before sleeping on waitcond
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        thread->waiting = true;
                        // signal drawcond if the thread has flagged itself as starving
                        if (thread->starving) Thread_CondSignal(thread->drawcond);
                        Thread_CondWait(thread->waitcond, thread->drawmutex);
                        thread->waiting = false;
                }
                Thread_UnlockMutex(thread->drawmutex);
        }
        dpsoftrast.commandpool.usedcommands = usedcommands;
}
867
// Pads `size` with the amount needed to reach the next multiple of
// COMMAND_SIZE (the masking with COMMAND_SIZE-1 assumes COMMAND_SIZE is a
// power of two); a size that is already a multiple is left unchanged.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> from the command
// pool via DPSOFTRAST_AllocateCommand, tagged with DPSOFTRAST_OPCODE_<name>
// and sized to the aligned size of that struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
872
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
874 {
875         DPSOFTRAST_Command *command;
876         int freecommand = dpsoftrast.commandpool.freecommand;
877         int usedcommands = dpsoftrast.commandpool.usedcommands;
878         int extra = sizeof(DPSOFTRAST_Command);
879         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
881         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
882         {
883                 if (dpsoftrast.usethreads)
884                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
885                 else
886                         DPSOFTRAST_Draw_FlushThreads();
887                 freecommand = dpsoftrast.commandpool.freecommand;
888                 usedcommands = dpsoftrast.commandpool.usedcommands;
889         }
890         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
891         {
892                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893                 command->opcode = DPSOFTRAST_OPCODE_Reset;
894                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
895                 freecommand = 0;
896         }
897         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898         command->opcode = opcode;
899         command->commandsize = size;
900         freecommand += size;
901         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
902                 freecommand = 0;
903         dpsoftrast.commandpool.freecommand = freecommand;
904         dpsoftrast.commandpool.usedcommands = usedcommands + size;
905         return command;
906 }
907
908 static void DPSOFTRAST_UndoCommand(int size)
909 {
910         int freecommand = dpsoftrast.commandpool.freecommand;
911         int usedcommands = dpsoftrast.commandpool.usedcommands;
912         freecommand -= size;
913         if (freecommand < 0)
914                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915         usedcommands -= size;
916         dpsoftrast.commandpool.freecommand = freecommand;
917         dpsoftrast.commandpool.usedcommands = usedcommands;
918 }
919                 
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
922 {
923         thread->viewport[0] = command->x;
924         thread->viewport[1] = command->y;
925         thread->viewport[2] = command->width;
926         thread->viewport[3] = command->height;
927         thread->validate |= DPSOFTRAST_VALIDATE_FB;
928 }
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
930 {
931         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
932         command->x = x;
933         command->y = y;
934         command->width = width;
935         command->height = height;
936
937         dpsoftrast.viewport[0] = x;
938         dpsoftrast.viewport[1] = y;
939         dpsoftrast.viewport[2] = width;
940         dpsoftrast.viewport[3] = height;
941         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
942 }
943
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
946 {
947         int i, x1, y1, x2, y2, w, h, x, y;
948         int miny1, maxy1, miny2, maxy2;
949         int bandy;
950         unsigned int *p;
951         unsigned int c;
952         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953         miny1 = thread->miny1;
954         maxy1 = thread->maxy1;
955         miny2 = thread->miny2;
956         maxy2 = thread->maxy2;
957         x1 = thread->fb_scissor[0];
958         y1 = thread->fb_scissor[1];
959         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
961         if (y1 < miny1) y1 = miny1;
962         if (y2 > maxy2) y2 = maxy2;
963         w = x2 - x1;
964         h = y2 - y1;
965         if (w < 1 || h < 1)
966                 return;
967         // FIXME: honor fb_colormask?
968         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
969         for (i = 0;i < 4;i++)
970         {
971                 if (!dpsoftrast.fb_colorpixels[i])
972                         continue;
973                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
974                 for (;y < bandy;y++)
975                 {
976                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977                         for (x = x1;x < x2;x++)
978                                 p[x] = c;
979                 }
980         }
981 }
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
983 {
984         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
985         command->r = r;
986         command->g = g;
987         command->b = b;
988         command->a = a;
989 }
990
991 DEFCOMMAND(3, ClearDepth, float depth;)
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
993 {
994         int x1, y1, x2, y2, w, h, x, y;
995         int miny1, maxy1, miny2, maxy2;
996         int bandy;
997         unsigned int *p;
998         unsigned int c;
999         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000         miny1 = thread->miny1;
1001         maxy1 = thread->maxy1;
1002         miny2 = thread->miny2;
1003         maxy2 = thread->maxy2;
1004         x1 = thread->fb_scissor[0];
1005         y1 = thread->fb_scissor[1];
1006         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1008         if (y1 < miny1) y1 = miny1;
1009         if (y2 > maxy2) y2 = maxy2;
1010         w = x2 - x1;
1011         h = y2 - y1;
1012         if (w < 1 || h < 1)
1013                 return;
1014         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1015         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016         for (;y < bandy;y++)
1017         {
1018                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019                 for (x = x1;x < x2;x++)
1020                         p[x] = c;
1021         }
1022 }
1023 void DPSOFTRAST_ClearDepth(float d)
1024 {
1025         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1026         command->depth = d;
1027 }
1028
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1031 {
1032         thread->colormask[0] = command->r != 0;
1033         thread->colormask[1] = command->g != 0;
1034         thread->colormask[2] = command->b != 0;
1035         thread->colormask[3] = command->a != 0;
1036         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1037 }
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1039 {
1040         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041         command->r = r;
1042         command->g = g;
1043         command->b = b;
1044         command->a = a;
1045 }
1046
1047 DEFCOMMAND(5, DepthTest, int enable;)
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1049 {
1050         thread->depthtest = command->enable;
1051         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1052 }
1053 void DPSOFTRAST_DepthTest(int enable)
1054 {
1055         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056         command->enable = enable;
1057 }
1058
1059 DEFCOMMAND(6, ScissorTest, int enable;)
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1061 {
1062         thread->scissortest = command->enable;
1063         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1064 }
1065 void DPSOFTRAST_ScissorTest(int enable)
1066 {
1067         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1073 {
1074         thread->scissor[0] = command->x;
1075         thread->scissor[1] = command->y;
1076         thread->scissor[2] = command->width;
1077         thread->scissor[3] = command->height;
1078         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1079 }
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1081 {
1082         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1083         command->x = x;
1084         command->y = y;
1085         command->width = width;
1086         command->height = height;
1087 }
1088
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1091 {
1092         thread->blendfunc[0] = command->sfactor;
1093         thread->blendfunc[1] = command->dfactor;
1094         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1095 }
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1097 {
1098         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099         command->sfactor = sfactor;
1100         command->dfactor = dfactor;
1101 }
1102
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1105 {
1106         thread->blendsubtract = command->enable;
1107         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1108 }
1109 void DPSOFTRAST_BlendSubtract(int enable)
1110 {
1111         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112         command->enable = enable;
1113 }
1114
1115 DEFCOMMAND(10, DepthMask, int enable;)
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1117 {
1118         thread->depthmask = command->enable;
1119 }
1120 void DPSOFTRAST_DepthMask(int enable)
1121 {
1122         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123         command->enable = enable;
1124 }
1125
1126 DEFCOMMAND(11, DepthFunc, int func;)
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1128 {
1129         thread->depthfunc = command->func;
1130 }
1131 void DPSOFTRAST_DepthFunc(int func)
1132 {
1133         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134         command->func = func;
1135 }
1136
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1139 {
1140         thread->depthrange[0] = command->nearval;
1141         thread->depthrange[1] = command->farval;
1142 }
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1144 {
1145         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146         command->nearval = nearval;
1147         command->farval = farval;
1148 }
1149
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1152 {
1153         thread->polygonoffset[0] = command->alongnormal;
1154         thread->polygonoffset[1] = command->intoview;
1155 }
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1157 {
1158         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159         command->alongnormal = alongnormal;
1160         command->intoview = intoview;
1161 }
1162
1163 DEFCOMMAND(14, CullFace, int mode;)
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1165 {
1166         thread->cullface = command->mode;
1167 }
1168 void DPSOFTRAST_CullFace(int mode)
1169 {
1170         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171         command->mode = mode;
1172 }
1173
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1175 {
1176         dpsoftrast.color[0] = r;
1177         dpsoftrast.color[1] = g;
1178         dpsoftrast.color[2] = b;
1179         dpsoftrast.color[3] = a;
1180 }
1181
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1183 {
1184         int outstride = blockwidth * 4;
1185         int instride = dpsoftrast.fb_width * 4;
1186         int bx1 = blockx;
1187         int by1 = blocky;
1188         int bx2 = blockx + blockwidth;
1189         int by2 = blocky + blockheight;
1190         int bw;
1191         int x;
1192         int y;
1193         unsigned char *inpixels;
1194         unsigned char *b;
1195         unsigned char *o;
1196         DPSOFTRAST_Flush();
1197         if (bx1 < 0) bx1 = 0;
1198         if (by1 < 0) by1 = 0;
1199         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1201         bw = bx2 - bx1;
1202         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1203         if (dpsoftrast.bigendian)
1204         {
1205                 for (y = by1;y < by2;y++)
1206                 {
1207                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1209                         for (x = bx1;x < bx2;x++)
1210                         {
1211                                 o[0] = b[3];
1212                                 o[1] = b[2];
1213                                 o[2] = b[1];
1214                                 o[3] = b[0];
1215                                 o += 4;
1216                                 b += 4;
1217                         }
1218                 }
1219         }
1220         else
1221         {
1222                 for (y = by1;y < by2;y++)
1223                 {
1224                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1226                         memcpy(o, b, bw*4);
1227                 }
1228         }
1229
1230 }
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1232 {
1233         int tx1 = tx;
1234         int ty1 = ty;
1235         int tx2 = tx + width;
1236         int ty2 = ty + height;
1237         int sx1 = sx;
1238         int sy1 = sy;
1239         int sx2 = sx + width;
1240         int sy2 = sy + height;
1241         int swidth;
1242         int sheight;
1243         int twidth;
1244         int theight;
1245         int sw;
1246         int sh;
1247         int tw;
1248         int th;
1249         int y;
1250         unsigned int *spixels;
1251         unsigned int *tpixels;
1252         DPSOFTRAST_Texture *texture;
1253         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254         if (mip < 0 || mip >= texture->mipmaps) return;
1255         DPSOFTRAST_Flush();
1256         spixels = dpsoftrast.fb_colorpixels[0];
1257         swidth = dpsoftrast.fb_width;
1258         sheight = dpsoftrast.fb_height;
1259         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260         twidth = texture->mipmap[mip][2];
1261         theight = texture->mipmap[mip][3];
1262         if (tx1 < 0) tx1 = 0;
1263         if (ty1 < 0) ty1 = 0;
1264         if (tx2 > twidth) tx2 = twidth;
1265         if (ty2 > theight) ty2 = theight;
1266         if (sx1 < 0) sx1 = 0;
1267         if (sy1 < 0) sy1 = 0;
1268         if (sx2 > swidth) sx2 = swidth;
1269         if (sy2 > sheight) sy2 = sheight;
1270         tw = tx2 - tx1;
1271         th = ty2 - ty1;
1272         sw = sx2 - sx1;
1273         sh = sy2 - sy1;
1274         if (tw > sw) tw = sw;
1275         if (th > sh) th = sh;
1276         if (tw < 1 || th < 1)
1277                 return;
1278         sy1 = sheight - sy1 - th;
1279         ty1 = theight - ty1 - th;
1280         for (y = 0;y < th;y++)
1281                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1282         if (texture->mipmaps > 1)
1283                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1284 }
1285
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1288 {
1289         if (thread->texbound[command->unitnum])
1290                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291         thread->texbound[command->unitnum] = command->texture;
1292 }
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1294 {
1295         DPSOFTRAST_Command_SetTexture *command;
1296         DPSOFTRAST_Texture *texture;
1297         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1298         {
1299                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1300                 return;
1301         }
1302         texture = DPSOFTRAST_Texture_GetByIndex(index);
1303         if (index && !texture)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1306                 return;
1307         }
1308
1309         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310         command->unitnum = unitnum;
1311         command->texture = texture;
1312
1313         dpsoftrast.texbound[unitnum] = texture;
1314         if (texture)
1315                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1316 }
1317
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1319 {
1320         dpsoftrast.pointer_vertex3f = vertex3f;
1321         dpsoftrast.stride_vertex = stride;
1322 }
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1324 {
1325         dpsoftrast.pointer_color4f = color4f;
1326         dpsoftrast.pointer_color4ub = NULL;
1327         dpsoftrast.stride_color = stride;
1328 }
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1330 {
1331         dpsoftrast.pointer_color4f = NULL;
1332         dpsoftrast.pointer_color4ub = color4ub;
1333         dpsoftrast.stride_color = stride;
1334 }
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1336 {
1337         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339         dpsoftrast.stride_texcoord[unitnum] = stride;
1340 }
1341
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1344 {
1345         thread->shader_mode = command->mode;
1346         thread->shader_permutation = command->permutation;
1347         thread->shader_exactspecularmath = command->exactspecularmath;
1348 }
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1350 {
1351         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352         command->mode = mode;
1353         command->permutation = permutation;
1354         command->exactspecularmath = exactspecularmath;
1355
1356         dpsoftrast.shader_mode = mode;
1357         dpsoftrast.shader_permutation = permutation;
1358         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1359 }
1360
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1363 {
1364         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1365 }
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1367 {
1368         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369         command->index = index;
1370         command->val[0] = v0;
1371         command->val[1] = v1;
1372         command->val[2] = v2;
1373         command->val[3] = v3;
1374
1375         dpsoftrast.uniform4f[index*4+0] = v0;
1376         dpsoftrast.uniform4f[index*4+1] = v1;
1377         dpsoftrast.uniform4f[index*4+2] = v2;
1378         dpsoftrast.uniform4f[index*4+3] = v3;
1379 }
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1381 {
1382         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383         command->index = index;
1384         memcpy(command->val, v, sizeof(command->val));
1385
1386         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1387 }
1388
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1391 {
1392         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1393 }
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1395 {
1396 #ifdef SSE_POSSIBLE
1397         int i, index;
1398         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1399         {
1400                 __m128 m0, m1, m2, m3;
1401                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402                 command->index = (DPSOFTRAST_UNIFORM)index;
1403                 if (((size_t)v)&(ALIGN_SIZE-1))
1404                 {
1405                         m0 = _mm_loadu_ps(v);
1406                         m1 = _mm_loadu_ps(v+4);
1407                         m2 = _mm_loadu_ps(v+8);
1408                         m3 = _mm_loadu_ps(v+12);
1409                 }
1410                 else
1411                 {
1412                         m0 = _mm_load_ps(v);
1413                         m1 = _mm_load_ps(v+4);
1414                         m2 = _mm_load_ps(v+8);
1415                         m3 = _mm_load_ps(v+12);
1416                 }
1417                 if (transpose)
1418                 {
1419                         __m128 t0, t1, t2, t3;
1420                         t0 = _mm_unpacklo_ps(m0, m1);
1421                         t1 = _mm_unpacklo_ps(m2, m3);
1422                         t2 = _mm_unpackhi_ps(m0, m1);
1423                         t3 = _mm_unpackhi_ps(m2, m3);
1424                         m0 = _mm_movelh_ps(t0, t1);
1425                         m1 = _mm_movehl_ps(t1, t0);
1426                         m2 = _mm_movelh_ps(t2, t3);
1427                         m3 = _mm_movehl_ps(t3, t2);                     
1428                 }
1429                 _mm_store_ps(command->val, m0);
1430                 _mm_store_ps(command->val+4, m1);
1431                 _mm_store_ps(command->val+8, m2);
1432                 _mm_store_ps(command->val+12, m3);
1433                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1437         }
1438 #endif
1439 }
1440
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1443 {
1444         thread->uniform1i[command->index] = command->val;
1445 }
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1447 {
1448         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449         command->index = index;
1450         command->val = i0;
1451
1452         dpsoftrast.uniform1i[command->index] = i0;
1453 }
1454
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1457 {
1458         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1460 }
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1462 {
1463         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464         command->clipplane[0] = x;
1465         command->clipplane[1] = y;
1466         command->clipplane[2] = z;
1467         command->clipplane[3] = w;
1468 }
1469
1470 #ifdef SSE_POSSIBLE
1471 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1472 {
1473         float *end = dst + size*4;
1474         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1475         {
1476                 while (dst < end)
1477                 {
1478                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1479                         dst += 4;
1480                         src += stride;
1481                 }
1482         }
1483         else
1484         {
1485                 while (dst < end)
1486                 {
1487                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1488                         dst += 4;
1489                         src += stride;
1490                 }
1491         }
1492 }
1493
1494 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1495 {
1496         float *end = dst + size*4;
1497         if (stride == sizeof(float[3]))
1498         {
1499                 float *end4 = dst + (size&~3)*4;        
1500                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1501                 {
1502                         while (dst < end4)
1503                         {
1504                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1505                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1506                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1508                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1509                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1512                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1513                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1514                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1515                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1516                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517                                 dst += 16;
1518                                 src += 4*sizeof(float[3]);
1519                         }
1520                 }
1521                 else
1522                 {
1523                         while (dst < end4)
1524                         {
1525                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1526                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1527                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1530                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1533                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1534                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1535                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1537                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538                                 dst += 16;
1539                                 src += 4*sizeof(float[3]);
1540                         }
1541                 }
1542         }
1543         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1544         {
1545                 while (dst < end)
1546                 {
1547                         __m128 v = _mm_loadu_ps((const float *)src);
1548                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1549                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1550                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1551                         _mm_store_ps(dst, v);
1552                         dst += 4;
1553                         src += stride;
1554                 }
1555         }
1556         else
1557         {
1558                 while (dst < end)
1559                 {
1560                         __m128 v = _mm_load_ps((const float *)src);
1561                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1562                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1563                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1564                         _mm_store_ps(dst, v);
1565                         dst += 4;
1566                         src += stride;
1567                 }
1568         }
1569 }
1570
1571 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1572 {
1573         float *end = dst + size*4;
1574         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1575         if (stride == sizeof(float[2]))
1576         {
1577                 float *end2 = dst + (size&~1)*4;
1578                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1579                 {
1580                         while (dst < end2)
1581                         {
1582                                 __m128 v = _mm_loadu_ps((const float *)src);
1583                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1584                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585                                 dst += 8;
1586                                 src += 2*sizeof(float[2]);
1587                         }
1588                 }
1589                 else
1590                 {
1591                         while (dst < end2)
1592                         {
1593                                 __m128 v = _mm_load_ps((const float *)src);
1594                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1595                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1596                                 dst += 8;
1597                                 src += 2*sizeof(float[2]);
1598                         }
1599                 }
1600         }
1601         while (dst < end)
1602         {
1603                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1604                 dst += 4;
1605                 src += stride;
1606         }
1607 }
1608
1609 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1610 {
1611         float *end = dst + size*4;
1612         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1613         if (stride == sizeof(unsigned char[4]))
1614         {
1615                 float *end4 = dst + (size&~3)*4;
1616                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1617                 {
1618                         while (dst < end4)
1619                         {
1620                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1621                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1622                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1625                                 dst += 16;
1626                                 src += 4*sizeof(unsigned char[4]);
1627                         }
1628                 }
1629                 else
1630                 {
1631                         while (dst < end4)
1632                         {
1633                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1634                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1635                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1638                                 dst += 16;
1639                                 src += 4*sizeof(unsigned char[4]);
1640                         }
1641                 }
1642         }
1643         while (dst < end)
1644         {
1645                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1646                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1647                 dst += 4;
1648                 src += stride;
1649         }
1650 }
1651
// writes 'size' copies of the 4-float vector at src into dst
// src is read once with an unaligned load; dst is written with aligned
// 16-byte stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 fill = _mm_loadu_ps(src);
	int i;
	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, fill);
}
1662 #endif
1663
// transforms 'numitems' 4-float vectors by the 16-float matrix:
// out = x*row0 + y*row1 + z*row2 + w*row3, where row N is inmatrix16f[N*4..N*4+3]
// out4f is written with aligned 16-byte stores and may be the same pointer as in4f
// does nothing when SSE_POSSIBLE is not defined
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop = out4f + numitems*4;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: the output is just a copy of the input
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (!(((size_t)in4f)&(ALIGN_SIZE-1)))
	{
		// input is 16-byte aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 x = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 y = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 z = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 w = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(x, _mm_add_ps(y, _mm_add_ps(z, w))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 x = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 y = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 z = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 w = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(x, _mm_add_ps(y, _mm_add_ps(z, w))));
		}
	}
#endif
}
1711
1712 #if 0
// copies 'numitems' 4-float vectors from in4f to out4f
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1717 #endif
1718
1719 #ifdef SSE_POSSIBLE
// projects the (x, y, z, w) vector 'in' and stores the result in 'out':
// the vector is rotated to (1, x, y, z), multiplied by viewportscale, divided
// by w, offset by viewportcenter, and rotated back, so element 3 of 'out'
// holds center[0] + scale[0]/w
// viewportcenter and viewportscale are therefore applied in (w, x, y, z) order
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* rotate to (w, x, y, z), then replace element 0 with 1 */ \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	/* rotate back so element 0 is x again */ \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1727
// projects the (x, y, z, w) vector 'in' and stores the result in 'out';
// the expansion performs the same operations as DPSOFTRAST_PROJECTVERTEX:
// rotate to (1, x, y, z), scale, divide by w, add the center, rotate back
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* rotate to (w, x, y, z), then replace element 0 with 1 */ \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	/* rotate back so element 0 is x again */ \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1735
// multiplies the vector 'in' by the matrix whose rows are m0..m3:
// out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// 'in' is copied first, so 'out' and 'in' may name the same variable
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1744
1745 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1746 {
1747         int clipmask = 0xFF;
1748         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1749         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1750         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1751         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1752         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1753         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1754         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1755         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1756         #define BBFRONT(k, pos) \
1757         { \
1758                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1759                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1760                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1761                 { \
1762                         __m128 proj; \
1763                         clipmask &= ~(1<<k); \
1764                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1765                         minproj = _mm_min_ss(minproj, proj); \
1766                         maxproj = _mm_max_ss(maxproj, proj); \
1767                 } \
1768         }
1769         BBFRONT(0, minpos); 
1770         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1771         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1772         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1773         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1774         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1776         BBFRONT(7, maxpos);
1777         #define BBCLIP(k) \
1778         { \
1779                 if (clipmask&(1<<k)) \
1780                 { \
1781                         if (!(clipmask&(1<<(k^1)))) \
1782                         { \
1783                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1784                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1785                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786                                 minproj = _mm_min_ss(minproj, proj); \
1787                                 maxproj = _mm_max_ss(maxproj, proj); \
1788                         } \
1789                         if (!(clipmask&(1<<(k^2)))) \
1790                         { \
1791                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1792                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1793                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794                                 minproj = _mm_min_ss(minproj, proj); \
1795                                 maxproj = _mm_max_ss(maxproj, proj); \
1796                         } \
1797                         if (!(clipmask&(1<<(k^4)))) \
1798                         { \
1799                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1800                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1801                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1802                                 minproj = _mm_min_ss(minproj, proj); \
1803                                 maxproj = _mm_max_ss(maxproj, proj); \
1804                         } \
1805                 } \
1806         }
1807         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1808         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1809         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1810         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1811         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1812         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1813         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1814         *starty = _mm_cvttss_si32(maxproj);
1815         *endy = _mm_cvttss_si32(minproj)+1;
1816         return clipmask;
1817 }
1818         
1819 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 {
1821         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1822         float *end = out4f + numitems*4;
1823         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1824         __m128 minpos, maxpos;
1825         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1826         {
1827                 minpos = maxpos = _mm_loadu_ps(in4f);
1828                 while (out4f < end)
1829                 {
1830                         __m128 v = _mm_loadu_ps(in4f);
1831                         minpos = _mm_min_ps(minpos, v);
1832                         maxpos = _mm_max_ps(maxpos, v);
1833                         _mm_store_ps(out4f, v);
1834                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1835                         _mm_store_ps(screen4f, v);
1836                         in4f += 4;
1837                         out4f += 4;
1838                         screen4f += 4;
1839                 }
1840         }
1841         else
1842         {
1843                 minpos = maxpos = _mm_load_ps(in4f);
1844                 while (out4f < end)
1845                 {
1846                         __m128 v = _mm_load_ps(in4f);
1847                         minpos = _mm_min_ps(minpos, v);
1848                         maxpos = _mm_max_ps(maxpos, v);
1849                         _mm_store_ps(out4f, v);
1850                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1851                         _mm_store_ps(screen4f, v);
1852                         in4f += 4;
1853                         out4f += 4;
1854                         screen4f += 4;
1855                 }
1856         }
1857         if (starty && endy) 
1858         {
1859                 ALIGN(float minposf[4]);
1860                 ALIGN(float maxposf[4]);
1861                 _mm_store_ps(minposf, minpos);
1862                 _mm_store_ps(maxposf, maxpos);
1863                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1864         }
1865         return 0;
1866 }
1867
1868 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 {
1870         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1871         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1872         float *end;
1873         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1874                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1875         end = out4f + numitems*4;
1876         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1877         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1878         m0 = _mm_loadu_ps(inmatrix16f);
1879         m1 = _mm_loadu_ps(inmatrix16f + 4);
1880         m2 = _mm_loadu_ps(inmatrix16f + 8);
1881         m3 = _mm_loadu_ps(inmatrix16f + 12);
1882         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1883         {
1884                 minpos = maxpos = _mm_loadu_ps(in4f);
1885                 while (out4f < end)
1886                 {
1887                         __m128 v = _mm_loadu_ps(in4f);
1888                         minpos = _mm_min_ps(minpos, v);
1889                         maxpos = _mm_max_ps(maxpos, v);
1890                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1891                         _mm_store_ps(out4f, v);
1892                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1893                         _mm_store_ps(screen4f, v);
1894                         in4f += 4;
1895                         out4f += 4;
1896                         screen4f += 4;
1897                 }
1898         }
1899         else
1900         {
1901                 minpos = maxpos = _mm_load_ps(in4f);
1902                 while (out4f < end)
1903                 {
1904                         __m128 v = _mm_load_ps(in4f);
1905                         minpos = _mm_min_ps(minpos, v);
1906                         maxpos = _mm_max_ps(maxpos, v);
1907                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1908                         _mm_store_ps(out4f, v);
1909                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1910                         _mm_store_ps(screen4f, v);
1911                         in4f += 4;
1912                         out4f += 4;
1913                         screen4f += 4;
1914                 }
1915         }
1916         if (starty && endy) 
1917         {
1918                 ALIGN(float minposf[4]);
1919                 ALIGN(float maxposf[4]);
1920                 _mm_store_ps(minposf, minpos);
1921                 _mm_store_ps(maxposf, maxpos);
1922                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1923         }
1924         return 0;
1925 }
1926 #endif
1927
// loads the vertex attribute 'inarray' for the current vertex range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into
// dpsoftrast.post_array4f[outarray] as 4-float vectors and returns that buffer
// any inarray other than POSITION or COLOR is treated as a texcoord array;
// if its pointer is not set or its component count is not 2, 3 or 4 the
// buffer is returned without being written
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		// float colors take precedence over byte colors; with neither
		// pointer set, every vertex gets the constant dpsoftrast.color
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1986
1987 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 {
1989         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1990         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1991         return data;
1992 }
1993
1994 #if 0
// projects an attribute array into dpsoftrast.screencoord4f and returns it,
// storing the clip mask in dpsoftrast.drawclipped and the row range in
// dpsoftrast.drawstarty / dpsoftrast.drawendy
// inarray >= 0: the array is first loaded into post_array4f[outarray];
// inarray < 0: the current contents of post_array4f[outarray] are used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
2005 #endif
2006
// transforms an attribute array in place by inmatrix16f, projects it into
// dpsoftrast.screencoord4f and returns it, storing the clip mask in
// dpsoftrast.drawclipped and the row range in dpsoftrast.drawstarty /
// dpsoftrast.drawendy
// inarray >= 0: the array is first loaded into post_array4f[outarray];
// inarray < 0: the current contents of post_array4f[outarray] are used
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2017
2018 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2019 {
2020         int x;
2021         int startx = span->startx;
2022         int endx = span->endx;
2023         float wslope = triangle->w[0];
2024         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2025         float endz = 1.0f / (w + wslope * startx);
2026         if (triangle->w[0] == 0)
2027         {
2028                 // LordHavoc: fast flat polygons (HUD/menu)
2029                 for (x = startx;x < endx;x++)
2030                         zf[x] = endz;
2031                 return;
2032         }
2033         for (x = startx;x < endx;)
2034         {
2035                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2036                 float z = endz, dz;
2037                 if (nextsub >= endx) nextsub = endsub = endx-1;
2038                 endz = 1.0f / (w + wslope * nextsub);
2039                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2040                 for (; x <= endsub; x++, z += dz)
2041                         zf[x] = z;
2042         }
2043 }
2044
2045 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2046 {
2047 #ifdef SSE_POSSIBLE
2048         int x;
2049         int startx = span->startx;
2050         int endx = span->endx;
2051         int maskx;
2052         int subx;
2053         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2054         unsigned char * RESTRICT pixelmask = span->pixelmask;
2055         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2056         if (!pixeli)
2057                 return;
2058         pixeli += span->y * dpsoftrast.fb_width + span->x;
2059         // handle alphatest now (this affects depth writes too)
2060         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2061                 for (x = startx;x < endx;x++)
2062                         if (in4ub[x*4+3] < 128)
2063                                 pixelmask[x] = false;
2064         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065         // helps sprites, text and hud artwork
2066         switch(thread->fb_blendmode)
2067         {
2068         case DPSOFTRAST_BLENDMODE_ALPHA:
2069         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071                 maskx = startx;
2072                 for (x = startx;x < endx;x++)
2073                 {
2074                         if (in4ub[x*4+3] >= 1)
2075                         {
2076                                 startx = x;
2077                                 for (;;)
2078                                 {
2079                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2080                                         maskx = x;
2081                                         if (x >= endx) break;
2082                                         ++x;
2083                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2084                                         if (x >= endx) break;
2085                                 }
2086                                 break;
2087                         }
2088                 }
2089                 endx = maskx;
2090                 break;
2091         case DPSOFTRAST_BLENDMODE_OPAQUE:
2092         case DPSOFTRAST_BLENDMODE_ADD:
2093         case DPSOFTRAST_BLENDMODE_INVMOD:
2094         case DPSOFTRAST_BLENDMODE_MUL:
2095         case DPSOFTRAST_BLENDMODE_MUL2:
2096         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2097         case DPSOFTRAST_BLENDMODE_INVADD:
2098                 break;
2099         }
2100         // put some special values at the end of the mask to ensure the loops end
2101         pixelmask[endx] = 1;
2102         pixelmask[endx+1] = 0;
2103         // LordHavoc: use a double loop to identify subspans, this helps the
2104         // optimized copy/blend loops to perform at their best, most triangles
2105         // have only one run of pixels, and do the search using wide reads...
2106         x = startx;
2107         while (x < endx)
2108         {
2109                 // if this pixel is masked off, it's probably not alone...
2110                 if (!pixelmask[x])
2111                 {
2112                         x++;
2113 #if 1
2114                         if (x + 8 < endx)
2115                         {
2116                                 // the 4-item search must be aligned or else it stalls badly
2117                                 if ((x & 3) && !pixelmask[x]) 
2118                                 {
2119                                         if(pixelmask[x]) goto endmasked;
2120                                         x++;
2121                                         if (x & 3)
2122                                         {
2123                                                 if(pixelmask[x]) goto endmasked;
2124                                                 x++;
2125                                                 if (x & 3)
2126                                                 {
2127                                                         if(pixelmask[x]) goto endmasked;
2128                                                         x++;
2129                                                 }
2130                                         }
2131                                 }
2132                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2133                                         x += 4;
2134                         }
2135 #endif
2136                         for (;!pixelmask[x];x++)
2137                                 ;
2138                         // rather than continue the loop, just check the end variable
2139                         if (x >= endx)
2140                                 break;
2141                 }
2142         endmasked:
2143                 // find length of subspan
2144                 subx = x + 1;
2145 #if 1
2146                 if (subx + 8 < endx)
2147                 {
2148                         if (subx & 3)
2149                         {
2150                                 if(!pixelmask[subx]) goto endunmasked;
2151                                 subx++;
2152                                 if (subx & 3)
2153                                 {
2154                                         if(!pixelmask[subx]) goto endunmasked;
2155                                         subx++;
2156                                         if (subx & 3)
2157                                         {
2158                                                 if(!pixelmask[subx]) goto endunmasked;
2159                                                 subx++;
2160                                         }
2161                                 }
2162                         }
2163                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2164                                 subx += 4;
2165                 }
2166 #endif
2167                 for (;pixelmask[subx];subx++)
2168                         ;
2169                 // the checks can overshoot, so make sure to clip it...
2170                 if (subx > endx)
2171                         subx = endx;
2172         endunmasked:
2173                 // now that we know the subspan length...  process!
2174                 switch(thread->fb_blendmode)
2175                 {
2176                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2177 #if 0
2178                         if (subx - x >= 16)
2179                         {
2180                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2181                                 x = subx;
2182                         }
2183                         else
2184 #elif 1
2185                         while (x + 16 <= subx)
2186                         {
2187                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2188                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2189                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2190                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2191                                 x += 16;
2192                         }
2193 #endif
2194                         {
2195                                 while (x + 4 <= subx)
2196                                 {
2197                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2198                                         x += 4;
2199                                 }
2200                                 if (x + 2 <= subx)
2201                                 {
2202                                         pixeli[x] = ini[x];
2203                                         pixeli[x+1] = ini[x+1];
2204                                         x += 2;
2205                                 }
2206                                 if (x < subx)
2207                                 {
2208                                         pixeli[x] = ini[x];
2209                                         x++;
2210                                 }
2211                         }
2212                         break;
2213                 case DPSOFTRAST_BLENDMODE_ALPHA:
2214                 #define FINISHBLEND(blend2, blend1) \
2215                         for (;x + 1 < subx;x += 2) \
2216                         { \
2217                                 __m128i src, dst; \
2218                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2220                                 blend2; \
2221                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2222                         } \
2223                         if (x < subx) \
2224                         { \
2225                                 __m128i src, dst; \
2226                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2227                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2228                                 blend1; \
2229                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2230                                 x++; \
2231                         }
2232                         FINISHBLEND({
2233                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2234                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2235                         }, {
2236                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2237                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238                         });
2239                         break;
2240                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2241                         FINISHBLEND({
2242                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2243                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2244                         }, {
2245                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2246                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247                         });
2248                         break;
2249                 case DPSOFTRAST_BLENDMODE_ADD:
2250                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2251                         break;
2252                 case DPSOFTRAST_BLENDMODE_INVMOD:
2253                         FINISHBLEND({
2254                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2255                         }, {
2256                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257                         });
2258                         break;
2259                 case DPSOFTRAST_BLENDMODE_MUL:
2260                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2261                         break;
2262                 case DPSOFTRAST_BLENDMODE_MUL2:
2263                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2264                         break;
2265                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2266                         FINISHBLEND({
2267                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2268                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2269                         }, {
2270                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2271                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272                         });
2273                         break;
2274                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2275                         FINISHBLEND({
2276                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2277                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2278                         }, {
2279                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2280                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281                         });
2282                         break;
2283                 case DPSOFTRAST_BLENDMODE_INVADD:
2284                         FINISHBLEND({
2285                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2286                         }, {
2287                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288                         });
2289                         break;
2290                 }
2291         }
2292 #endif
2293 }
2294
2295 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2296         // warning: this is SLOW, only use if the optimized per-span functions won't do
2297 {
2298         const unsigned char * RESTRICT pixelbase;
2299         const unsigned char * RESTRICT pixel[4];
2300         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2301         int wrapmask[2] = { width-1, height-1 };
2302         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2303         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2304         {
2305                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2306                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2307                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2308                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2309                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2310                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2311                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2312                 {
2313                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2314                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2315                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2316                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2317                 }
2318                 else
2319                 {
2320                         tci[0] &= wrapmask[0];
2321                         tci[1] &= wrapmask[1];
2322                         tci1[0] &= wrapmask[0];
2323                         tci1[1] &= wrapmask[1];
2324                 }
2325                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2326                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2327                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2328                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2329                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2330                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2331                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2332                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2333         }
2334         else
2335         {
2336                 int tci[2] = { x * width, y * height };
2337                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2338                 {
2339                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2340                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2341                 }
2342                 else
2343                 {
2344                         tci[0] &= wrapmask[0];
2345                         tci[1] &= wrapmask[1];
2346                 }
2347                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2348                 c[0] = pixel[0][0];
2349                 c[1] = pixel[0][1];
2350                 c[2] = pixel[0][2];
2351                 c[3] = pixel[0][3];
2352         }
2353 }
2354
#if 0
// NOTE: this function is compiled out by the surrounding #if 0.
//
// Float-output texture fetch for one span: for every x in
// [span->startx, span->endx) it samples the texture bound to unit
// texunitindex, using the texture coordinates interpolated from vertex array
// arrayindex, and writes the color to out4f[x*4 .. x*4+3] scaled to 0..1 with
// the first and third channels of the texel swapped.
//
// zf holds the per-pixel 1/w values for the span.  The perspective-correct
// coordinate is only evaluated at sub-span boundaries
// (DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly in 20.12
// fixed point in between.
//
// Texel coordinates are kept signed so that negative coordinates clamp to
// texel 0 / wrap correctly and so that the row offset (tciwidth is negative,
// rows are addressed downwards from pixelbase) is a proper signed value.
// The >>12 on a negative coordinate relies on an arithmetic right shift.
static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
	int x;
	int startx = span->startx;
	int endx = span->endx;
	int flags;
	float c[4];
	float data[4];
	float slope[4];
	float tc[2], endtc[2];
	float tcscale[2];
	int tci[2];
	int tci1[2];
	int tcimin[2];
	int tcimax[2];
	int tciwrapmask[2];
	int tciwidth;
	int filter;
	int mip;
	const unsigned char * RESTRICT pixelbase;
	const unsigned char * RESTRICT pixel[4];
	DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
	// if no texture is bound, just fill it with white
	if (!texture)
	{
		for (x = startx;x < endx;x++)
		{
			out4f[x*4+0] = 1.0f;
			out4f[x*4+1] = 1.0f;
			out4f[x*4+2] = 1.0f;
			out4f[x*4+3] = 1.0f;
		}
		return;
	}
	mip = triangle->mip[texunitindex];
	pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
	// if this mipmap of the texture is 1 pixel, just fill it with that color
	if (texture->mipmap[mip][1] == 4)
	{
		// read the texel of the selected mip level (pixelbase), not the
		// first bytes of the texture
		c[0] = pixelbase[2] * (1.0f/255.0f);
		c[1] = pixelbase[1] * (1.0f/255.0f);
		c[2] = pixelbase[0] * (1.0f/255.0f);
		c[3] = pixelbase[3] * (1.0f/255.0f);
		for (x = startx;x < endx;x++)
		{
			out4f[x*4+0] = c[0];
			out4f[x*4+1] = c[1];
			out4f[x*4+2] = c[2];
			out4f[x*4+3] = c[3];
		}
		return;
	}
	filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	flags = texture->flags;
	tcscale[0] = texture->mipmap[mip][2];
	tcscale[1] = texture->mipmap[mip][3];
	// negative: each texture row lies before the previous one relative to pixelbase
	tciwidth = -texture->mipmap[mip][2];
	tcimin[0] = 0;
	tcimin[1] = 0;
	tcimax[0] = texture->mipmap[mip][2]-1;
	tcimax[1] = texture->mipmap[mip][3]-1;
	tciwrapmask[0] = texture->mipmap[mip][2]-1;
	tciwrapmask[1] = texture->mipmap[mip][3]-1;
	endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
	endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
	if (filter)
	{
		// bilinear filtering samples around the texel center
		endtc[0] -= 0.5f;
		endtc[1] -= 0.5f;
	}
	for (x = startx;x < endx;)
	{
		// 20.12 fixed point coordinate and per-pixel step for this sub-span
		int subtc[2];
		int substep[2];
		float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
		int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
		if (nextsub >= endx)
		{
			// last sub-span: anchor on the final pixel of the span
			nextsub = endsub = endx-1;
			if (x < nextsub) subscale = 4096.0f / (nextsub - x);
		}
		tc[0] = endtc[0];
		tc[1] = endtc[1];
		endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
		endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
		if (filter)
		{
			endtc[0] -= 0.5f;
			endtc[1] -= 0.5f;
		}
		substep[0] = (endtc[0] - tc[0]) * subscale;
		substep[1] = (endtc[1] - tc[1]) * subscale;
		subtc[0] = tc[0] * (1<<12);
		subtc[1] = tc[1] * (1<<12);
		if (filter)
		{
			if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
			{
				// bilinear, clamp to edge
				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
				{
					unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
					unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
					unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
					tci[0] = subtc[0]>>12;
					tci[1] = subtc[1]>>12;
					tci1[0] = tci[0] + 1;
					tci1[1] = tci[1] + 1;
					tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
					tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
					tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
					tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
					pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
					pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
					pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
					pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
					c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
					c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
					c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
					c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
					out4f[x*4+0] = c[0];
					out4f[x*4+1] = c[1];
					out4f[x*4+2] = c[2];
					out4f[x*4+3] = c[3];
				}
			}
			else
			{
				// bilinear, wrapped
				for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
				{
					unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
					unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
					unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
					tci[0] = subtc[0]>>12;
					tci[1] = subtc[1]>>12;
					tci1[0] = tci[0] + 1;
					tci1[1] = tci[1] + 1;
					tci[0] &= tciwrapmask[0];
					tci[1] &= tciwrapmask[1];
					tci1[0] &= tciwrapmask[0];
					tci1[1] &= tciwrapmask[1];
					pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
					pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
					pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
					pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
					c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
					c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
					c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
					c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
					out4f[x*4+0] = c[0];
					out4f[x*4+1] = c[1];
					out4f[x*4+2] = c[2];
					out4f[x*4+3] = c[3];
				}
			}
		}
		else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
		{
			// nearest, clamp to edge
			for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
			{
				tci[0] = subtc[0]>>12;
				tci[1] = subtc[1]>>12;
				tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
				tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
				pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
				c[0] = pixel[0][2] * (1.0f / 255.0f);
				c[1] = pixel[0][1] * (1.0f / 255.0f);
				c[2] = pixel[0][0] * (1.0f / 255.0f);
				c[3] = pixel[0][3] * (1.0f / 255.0f);
				out4f[x*4+0] = c[0];
				out4f[x*4+1] = c[1];
				out4f[x*4+2] = c[2];
				out4f[x*4+3] = c[3];
			}
		}
		else
		{
			// nearest, wrapped
			for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
			{
				tci[0] = subtc[0]>>12;
				tci[1] = subtc[1]>>12;
				tci[0] &= tciwrapmask[0];
				tci[1] &= tciwrapmask[1];
				pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
				c[0] = pixel[0][2] * (1.0f / 255.0f);
				c[1] = pixel[0][1] * (1.0f / 255.0f);
				c[2] = pixel[0][0] * (1.0f / 255.0f);
				c[3] = pixel[0][3] * (1.0f / 255.0f);
				out4f[x*4+0] = c[0];
				out4f[x*4+1] = c[1];
				out4f[x*4+2] = c[2];
				out4f[x*4+3] = c[3];
			}
		}
	}
}
#endif
2553
2554 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2555 {
2556 #ifdef SSE_POSSIBLE
2557         int x;
2558         int startx = span->startx;
2559         int endx = span->endx;
2560         int flags;
2561         __m128 data, slope, tcscale;
2562         __m128i tcsize, tcmask, tcoffset, tcmax;
2563         __m128 tc, endtc;
2564         __m128i subtc, substep, endsubtc;
2565         int filter;
2566         int mip;
2567         int affine; // LordHavoc: optimized affine texturing case
2568         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2569         const unsigned char * RESTRICT pixelbase;
2570         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2571         // if no texture is bound, just fill it with white
2572         if (!texture)
2573         {
2574                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2575                 return;
2576         }
2577         mip = triangle->mip[texunitindex];
2578         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2579         // if this mipmap of the texture is 1 pixel, just fill it with that color
2580         if (texture->mipmap[mip][1] == 4)
2581         {
2582                 unsigned int k = *((const unsigned int *)pixelbase);
2583                 for (x = startx;x < endx;x++)
2584                         outi[x] = k;
2585                 return;
2586         }
2587         affine = zf[startx] == zf[endx-1];
2588         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2589         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2590         flags = texture->flags;
2591         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2592         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2593         tcscale = _mm_cvtepi32_ps(tcsize);
2594         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2595         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2596         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2597         if (filter)
2598                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2599         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2600         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2601         tcmax = _mm_packs_epi32(tcmask, tcmask);
2602         for (x = startx;x < endx;)
2603         {
2604                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2605                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2606                 if (nextsub >= endx || affine)
2607                 {
2608                         nextsub = endsub = endx-1;
2609                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2610                 }       
2611                 tc = endtc;
2612                 subtc = endsubtc;
2613                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2614                 if (filter)
2615                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2616                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2617                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2618                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2619                 substep = _mm_slli_epi32(substep, 1);
2620                 if (filter)
2621                 {
2622                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2623                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2624                         {
2625                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2626                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2627                                 {
2628                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2629                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2630                                         tci = _mm_madd_epi16(tci, tcoffset);
2631                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2632                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2633                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2634                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2635                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2636                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2637                                         fracm = _mm_srli_epi16(subtc, 1);
2638                                         pix1 = _mm_add_epi16(pix1,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2641                                         pix3 = _mm_add_epi16(pix3,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2644                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2645                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2646                                         pix2 = _mm_add_epi16(pix2,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2648                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2649                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2650                                 }
2651                                 if (x <= endsub)
2652                                 {
2653                                         const unsigned char * RESTRICT ptr1;
2654                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2655                                         tci = _mm_madd_epi16(tci, tcoffset);
2656                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2657                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2658                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2659                                         fracm = _mm_srli_epi16(subtc, 1);
2660                                         pix1 = _mm_add_epi16(pix1,
2661                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2663                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2664                                         pix1 = _mm_add_epi16(pix1,
2665                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2666                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2667                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2668                                         x++;
2669                                 }
2670                         }
2671                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2672                         {
2673                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2674                                 {
2675                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2676                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680                                                                                         _mm_setzero_si128());
2681                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2685                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2686                                         tci = _mm_madd_epi16(tci, tcoffset);
2687                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         fracm = _mm_srli_epi16(subtc, 1);
2694                                         pix1 = _mm_add_epi16(pix1,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2696                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2697                                         pix3 = _mm_add_epi16(pix3,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2699                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2700                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2701                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2702                                         pix2 = _mm_add_epi16(pix2,
2703                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2704                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2705                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2706                                 }
2707                                 if (x <= endsub)
2708                                 {
2709                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2710                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2713                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2714                                                                                         _mm_setzero_si128());
2715                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2716                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2717                                                                                         _mm_setzero_si128());
2718                                         fracm = _mm_srli_epi16(subtc, 1);
2719                                         pix1 = _mm_add_epi16(pix1,
2720                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2721                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2722                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2723                                         pix1 = _mm_add_epi16(pix1,
2724                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2725                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2726                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2727                                         x++;
2728                                 }
2729                         }
2730                         else
2731                         {
2732                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2733                                 {
2734                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2735                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2736                                         tci = _mm_madd_epi16(tci, tcoffset);
2737                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2738                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2739                                                                                         _mm_setzero_si128());
2740                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2744                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2745                                         tci = _mm_madd_epi16(tci, tcoffset);
2746                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2747                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2748                                                                                         _mm_setzero_si128());
2749                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         fracm = _mm_srli_epi16(subtc, 1);
2753                                         pix1 = _mm_add_epi16(pix1,
2754                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2755                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2756                                         pix3 = _mm_add_epi16(pix3,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2758                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2759                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2760                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2761                                         pix2 = _mm_add_epi16(pix2,
2762                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2763                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2764                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2765                                 }
2766                                 if (x <= endsub)
2767                                 {
2768                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2769                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2770                                         tci = _mm_madd_epi16(tci, tcoffset);
2771                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2772                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2773                                                                                         _mm_setzero_si128());
2774                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2775                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2776                                                                                         _mm_setzero_si128());
2777                                         fracm = _mm_srli_epi16(subtc, 1);
2778                                         pix1 = _mm_add_epi16(pix1,
2779                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2780                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2781                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2782                                         pix1 = _mm_add_epi16(pix1,
2783                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2784                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2785                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2786                                         x++;
2787                                 }
2788                         }
2789                 }
2790                 else
2791                 {
2792                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2793                         {
2794                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2795                                 {
2796                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2801                                 }
2802                                 if (x <= endsub)
2803                                 {
2804                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2805                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2806                                         tci = _mm_madd_epi16(tci, tcoffset);
2807                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2808                                         x++;
2809                                 }
2810                         }
2811                         else
2812                         {
2813                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2814                                 {
2815                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2816                                         tci = _mm_and_si128(tci, tcmax); 
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2820                                 }
2821                                 if (x <= endsub)
2822                                 {
2823                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2824                                         tci = _mm_and_si128(tci, tcmax); 
2825                                         tci = _mm_madd_epi16(tci, tcoffset);
2826                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2827                                         x++;
2828                                 }
2829                         }
2830                 }
2831         }
2832 #endif
2833 }
2834
2835 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2836 {
2837         // TODO: IMPLEMENT
2838         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2839 }
2840
// Shadowmap lookup placeholder: no sampling is implemented yet, so every
// query yields the constant 1.0f regardless of the input vector.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector; // not consulted until sampling is implemented
	return result;
}
2846
#if 0
// (compiled out) Float variant: multiplies each of the four components of
// in4f by the interpolated attribute arrayindex (data + slope*x, times zf[x])
// and stores the product in out4f, for x in [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
{
	float data[4];
	float slope[4];
	float c[4];
	float z;
	int x;
	int i;
	const int startx = span->startx;
	const int endx = span->endx;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	x = startx;
	while (x < endx)
	{
		const float *src = in4f + x*4;
		float *dst = out4f + x*4;
		z = zf[x];
		// evaluate the attribute at this pixel first...
		for (i = 0; i < 4; i++)
			c[i] = (data[i] + slope[i]*x) * z;
		// ...then modulate the input color by it
		for (i = 0; i < 4; i++)
			dst[i] = src[i] * c[i];
		x++;
	}
}
#endif
2872
#if 0
// (compiled out) Float variant: writes the interpolated attribute arrayindex
// (data + slope*x, times zf[x]) as four floats per pixel into out4f, for x in
// [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
{
	float data[4];
	float slope[4];
	float c[4];
	float z;
	int x;
	int i;
	const int startx = span->startx;
	const int endx = span->endx;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	x = startx;
	while (x < endx)
	{
		float *dst = out4f + x*4;
		z = zf[x];
		// evaluate all four components before storing any of them
		for (i = 0; i < 4; i++)
			c[i] = (data[i] + slope[i]*x) * z;
		for (i = 0; i < 4; i++)
			dst[i] = c[i];
		x++;
	}
}
#endif
2898
#if 0
// (compiled out) For each pixel of the span: subtracts subcolor from inb4f,
// raises negative results to zero, and adds what remains to ina4f, storing
// four floats per pixel in out4f.
static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
{
	const int startx = span->startx;
	const int endx = span->endx;
	float localcolor[4];
	float c[4];
	int x;
	int i;
	// take a private copy of the color to subtract before touching out4f
	for (i = 0; i < 4; i++)
		localcolor[i] = subcolor[i];
	for (x = startx; x < endx; x++)
	{
		const float *a = ina4f + x*4;
		const float *b = inb4f + x*4;
		float *dst = out4f + x*4;
		for (i = 0; i < 4; i++)
		{
			c[i] = b[i] - localcolor[i];
			if (c[i] < 0.0f)
				c[i] = 0.0f;
		}
		for (i = 0; i < 4; i++)
			dst[i] = a[i] + c[i];
	}
}
#endif
2921
#if 0
// (compiled out) Component-wise product of two float pixel buffers over the
// span: out4f = ina4f * inb4f, four floats per pixel.
static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	const int startx = span->startx;
	const int endx = span->endx;
	int x;
	int i;
	for (x = startx; x < endx; x++)
	{
		const float *a = ina4f + x*4;
		const float *b = inb4f + x*4;
		float *dst = out4f + x*4;
		for (i = 0; i < 4; i++)
			dst[i] = a[i] * b[i];
	}
}
#endif
2935
#if 0
// (compiled out) Component-wise sum of two float pixel buffers over the
// span: out4f = ina4f + inb4f, four floats per pixel.
static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	const int startx = span->startx;
	const int endx = span->endx;
	int x;
	int i;
	for (x = startx; x < endx; x++)
	{
		const float *a = ina4f + x*4;
		const float *b = inb4f + x*4;
		float *dst = out4f + x*4;
		for (i = 0; i < 4; i++)
			dst[i] = a[i] + b[i];
	}
}
#endif
2949
#if 0
// (compiled out) Blends two float pixel buffers over the span, weighting
// inb4f by its own fourth component and ina4f by one minus that value.
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	const int startx = span->startx;
	const int endx = span->endx;
	float a, b;
	int x;
	int i;
	for (x = startx; x < endx; x++)
	{
		const float *srca = ina4f + x*4;
		const float *srcb = inb4f + x*4;
		float *dst = out4f + x*4;
		// both weights are read before any component of this pixel is written
		a = 1.0f - srcb[3];
		b = srcb[3];
		for (i = 0; i < 4; i++)
			dst[i] = srca[i] * a + srcb[i] * b;
	}
}
#endif
2966
#if 0
// (compiled out) Blends every pixel of in4f toward a single color over the
// span, using the color's fourth component as the blend weight, and stores
// four floats per pixel in out4f.
static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
{
	const int startx = span->startx;
	const int endx = span->endx;
	float localcolor[4];
	float ilerp, lerp;
	int x;
	int i;
	// take a private copy of the color before touching out4f
	for (i = 0; i < 4; i++)
		localcolor[i] = color[i];
	ilerp = 1.0f - localcolor[3];
	lerp = localcolor[3];
	for (x = startx; x < endx; x++)
	{
		const float *src = in4f + x*4;
		float *dst = out4f + x*4;
		for (i = 0; i < 4; i++)
			dst[i] = src[i] * ilerp + localcolor[i] * lerp;
	}
}
#endif
2987
2988
2989
2990 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2991 {
2992 #ifdef SSE_POSSIBLE
2993         int x;
2994         int startx = span->startx;
2995         int endx = span->endx;
2996         __m128 data, slope;
2997         __m128 mod, endmod;
2998         __m128i submod, substep, endsubmod;
2999         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3000         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3001         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3002         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3003         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3004         for (x = startx; x < endx;)
3005         {
3006                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3007                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3008                 if (nextsub >= endx)
3009                 {
3010                         nextsub = endsub = endx-1;
3011                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3012                 }
3013                 mod = endmod;
3014                 submod = endsubmod;
3015                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3016                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3017                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3018                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3019                 substep = _mm_packs_epi32(substep, substep);
3020                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3021                 {
3022                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3023                         pix = _mm_mulhi_epu16(pix, submod);
3024                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3025                 }
3026                 if (x <= endsub)
3027                 {
3028                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3029                         pix = _mm_mulhi_epu16(pix, submod);
3030                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3031                         x++;
3032                 }
3033         }
3034 #endif
3035 }
3036
3037 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3038 {
3039 #ifdef SSE_POSSIBLE
3040         int x;
3041         int startx = span->startx;
3042         int endx = span->endx;
3043         __m128 data, slope;
3044         __m128 mod, endmod;
3045         __m128i submod, substep, endsubmod;
3046         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3047         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3048         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3049         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3050         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3051         for (x = startx; x < endx;)
3052         {
3053                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3054                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3055                 if (nextsub >= endx)
3056                 {
3057                         nextsub = endsub = endx-1;
3058                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3059                 }
3060                 mod = endmod;
3061                 submod = endsubmod;
3062                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3063                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3064                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3065                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3066                 substep = _mm_packs_epi32(substep, substep);
3067                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3068                 {
3069                         __m128i pix = _mm_srai_epi16(submod, 4);
3070                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3071                 }
3072                 if (x <= endsub)
3073                 {
3074                         __m128i pix = _mm_srai_epi16(submod, 4);
3075                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3076                         x++;
3077                 }
3078         }
3079 #endif
3080 }
3081
3082 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3083 {
3084 #ifdef SSE_POSSIBLE
3085         int x, startx = span->startx, endx = span->endx;
3086         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3087         localcolor = _mm_packs_epi32(localcolor, localcolor);
3088         for (x = startx;x+2 <= endx;x+=2)
3089         {
3090                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3091                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3092                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3093                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3094         }
3095         if (x < endx)
3096         {
3097                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3098                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3099                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3100                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3101         }
3102 #endif
3103 }
3104
3105 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3106 {
3107 #ifdef SSE_POSSIBLE
3108         int x, startx = span->startx, endx = span->endx;
3109         for (x = startx;x+2 <= endx;x+=2)
3110         {
3111                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3112                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3113                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3114                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3115         }
3116         if (x < endx)
3117         {
3118                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3120                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3121                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3122         }
3123 #endif
3124 }
3125
3126 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3127 {
3128 #ifdef SSE_POSSIBLE
3129         int x, startx = span->startx, endx = span->endx;
3130         for (x = startx;x+2 <= endx;x+=2)
3131         {
3132                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3133                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3134                 pix1 = _mm_add_epi16(pix1, pix2);
3135                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3136         }
3137         if (x < endx)
3138         {
3139                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3140                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3141                 pix1 = _mm_add_epi16(pix1, pix2);
3142                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3143         }
3144 #endif
3145 }
3146
3147 #if 0
3148 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3149 {
3150 #ifdef SSE_POSSIBLE
3151         int x, startx = span->startx, endx = span->endx;
3152         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3153         tint = _mm_packs_epi32(tint, tint);
3154         for (x = startx;x+2 <= endx;x+=2)
3155         {
3156                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3157                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3158                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3159                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3160         }
3161         if (x < endx)
3162         {
3163                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3164                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3165                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3166                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3167         }
3168 #endif
3169 }
3170 #endif
3171
3172 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3173 {
3174 #ifdef SSE_POSSIBLE
3175         int x, startx = span->startx, endx = span->endx;
3176         for (x = startx;x+2 <= endx;x+=2)
3177         {
3178                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3179                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3180                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3181                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3182                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3183         }
3184         if (x < endx)
3185         {
3186                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3187                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3188                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3189                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3190                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3191         }
3192 #endif
3193 }
3194
3195 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3196 {
3197 #ifdef SSE_POSSIBLE
3198         int x, startx = span->startx, endx = span->endx;
3199         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3200         localcolor = _mm_packs_epi32(localcolor, localcolor);
3201         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3202         for (x = startx;x+2 <= endx;x+=2)
3203         {
3204                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3205                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3206                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3207         }
3208         if (x < endx)
3209         {
3210                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3211                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3212                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3213         }
3214 #endif
3215 }
3216
3217
3218
3219 static void DPSOFTRAST_VertexShader_Generic(void)
3220 {
3221         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3222         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3223         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3224         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3225                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3226 }
3227
3228 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3229 {
3230         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3231         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3234         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3235         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3236         {
3237                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3238                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3239                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3240                 {
3241                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3242                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3243                         {
3244                                 // multiply
3245                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3246                         }
3247                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3248                         {
3249                                 // add
3250                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251                         }
3252                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3253                         {
3254                                 // alphablend
3255                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3256                         }
3257                 }
3258         }
3259         else
3260                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3261         if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3262         {
3263                 int x;
3264                 for (x = span->startx;x < span->endx;x++)
3265                         buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3266         }
3267         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3268 }
3269
3270
3271
3272 static void DPSOFTRAST_VertexShader_PostProcess(void)
3273 {
3274         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3275         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3276         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3277 }
3278
3279 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 {
3281         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3282         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3287         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3288         {
3289                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3290                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3291         }
3292         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3293         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3294         {
3295                 // TODO: implement saturation
3296         }
3297         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3298         {
3299                 // TODO: implement gammaramps
3300         }
3301         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3302 }
3303
3304
3305
3306 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3307 {
3308         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3309 }
3310
3311 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3312 {
3313         // this is never called (because colormask is off when this shader is used)
3314         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3317         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3318         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3319 }
3320
3321
3322
3323 static void DPSOFTRAST_VertexShader_FlatColor(void)
3324 {
3325         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3326         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3327 }
3328
3329 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3330 {
3331 #ifdef SSE_POSSIBLE
3332         unsigned char * RESTRICT pixelmask = span->pixelmask;
3333         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3334         int x, startx = span->startx, endx = span->endx;
3335         __m128i Color_Ambientm;
3336         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3337         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3340         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3341         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3342                 pixel = buffer_FragColorbgra8;
3343         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3344         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3345         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3346         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3347         for (x = startx;x < endx;x++)
3348         {
3349                 __m128i color, pix;
3350                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3351                 {
3352                         __m128i pix2;
3353                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3354                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3355                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3356                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3357                         x += 3;
3358                         continue;
3359                 }
3360                 if (!pixelmask[x])
3361                         continue;
3362                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3363                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3364                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3365         }
3366         if (pixel == buffer_FragColorbgra8)
3367                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3368 #endif
3369 }
3370
3371
3372
3373 static void DPSOFTRAST_VertexShader_VertexColor(void)
3374 {
3375         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3377         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3378 }
3379
3380 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 {
3382 #ifdef SSE_POSSIBLE
3383         unsigned char * RESTRICT pixelmask = span->pixelmask;
3384         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385         int x, startx = span->startx, endx = span->endx;
3386         __m128i Color_Ambientm, Color_Diffusem;
3387         __m128 data, slope;
3388         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3389         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3394         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3395                 pixel = buffer_FragColorbgra8;
3396         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3397         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3398         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3399         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3400         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3401         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3402         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3403         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3404         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3405         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3406         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3407         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3408         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3409         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3410         {
3411                 __m128i color, mod, pix;
3412                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3413                 {
3414                         __m128i pix2, mod2;
3415                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3416                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3417                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3418                         data = _mm_add_ps(data, slope);
3419                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3420                         data = _mm_add_ps(data, slope);
3421                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3422                         data = _mm_add_ps(data, slope);
3423                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3424                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3425                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3427                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3429                         x += 3;
3430                         continue;
3431                 }
3432                 if (!pixelmask[x])
3433                         continue;
3434                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3436                 mod = _mm_packs_epi32(mod, mod);
3437                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3438                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3439         }
3440         if (pixel == buffer_FragColorbgra8)
3441                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3442 #endif
3443 }
3444
3445
3446
3447 static void DPSOFTRAST_VertexShader_Lightmap(void)
3448 {
3449         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3450         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3451         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3452 }
3453
3454 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 {
3456 #ifdef SSE_POSSIBLE
3457         unsigned char * RESTRICT pixelmask = span->pixelmask;
3458         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3459         int x, startx = span->startx, endx = span->endx;
3460         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3461         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3462         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3465         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3466         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3467         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3468         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3469         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3470                 pixel = buffer_FragColorbgra8;
3471         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3472         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3473         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3474         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3475         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3476         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3477         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3478         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3479         {
3480                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3481                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3482                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3483                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3484                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3485                 for (x = startx;x < endx;x++)
3486                 {
3487                         __m128i color, lightmap, glow, pix;
3488                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3489                         {
3490                                 __m128i pix2;
3491                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3492                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3493                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3494                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3495                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3496                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3497                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3498                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3499                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3500                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3501                                 x += 3;
3502                                 continue;
3503                         }
3504                         if (!pixelmask[x])
3505                                 continue;
3506                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3507                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3508                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3509                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3510                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3511                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3512                 }
3513         }
3514         else
3515         {
3516                 for (x = startx;x < endx;x++)
3517                 {
3518                         __m128i color, lightmap, pix;
3519                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3520                         {
3521                                 __m128i pix2;
3522                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3523                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3524                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3525                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3526                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3527                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3528                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3529                                 x += 3;
3530                                 continue;
3531                         }
3532                         if (!pixelmask[x]) 
3533                                 continue;
3534                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3535                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3536                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3537                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3538                 }
3539         }
3540         if (pixel == buffer_FragColorbgra8)
3541                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3542 #endif
3543 }
3544
3545
3546 void DPSOFTRAST_VertexShader_LightDirection(void);
3547 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3548
// Vertex stage of the fake-light shader mode: identical to the
// light-direction vertex shader, so simply forward to it.
static void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3553
3554 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3555 {
3556         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3557 }
3558
3559
3560
3561 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3562 {
3563         DPSOFTRAST_VertexShader_LightDirection();
3564         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3565 }
3566
3567 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3568 {
3569         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3570 }
3571
3572
3573
3574 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3575 {
3576         DPSOFTRAST_VertexShader_LightDirection();
3577         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3578 }
3579
3580 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3581 {
3582         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3583 }
3584
3585
3586
3587 void DPSOFTRAST_VertexShader_LightDirection(void)
3588 {
3589         int i;
3590         int numvertices = dpsoftrast.numvertices;
3591         float LightDir[4];
3592         float LightVector[4];
3593         float EyePosition[4];
3594         float EyeVectorModelSpace[4];
3595         float EyeVector[4];
3596         float position[4];
3597         float svector[4];
3598         float tvector[4];
3599         float normal[4];
3600         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3601         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3602         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3603         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3604         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3605         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3606         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3607         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3608         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3609         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3610         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3611         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3612         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3613         for (i = 0;i < numvertices;i++)
3614         {
3615                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3616                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3617                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3618                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3619                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3620                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3621                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3622                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3623                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3624                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3625                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3626                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3627                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3628                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3629                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3633                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3634                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3635                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3636                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3637                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3638                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3639                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3641                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3642                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3643                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3644         }
3645         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3646 }
3647
3648 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3649 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3650 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3651 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3652 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3653 #define DPSOFTRAST_Vector3Normalize(v)\
3654 do\
3655 {\
3656         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3657         if (len)\
3658         {\
3659                 len = 1.0f / len;\
3660                 v[0] *= len;\
3661                 v[1] *= len;\
3662                 v[2] *= len;\
3663         }\
3664 }\
3665 while(0)
3666
3667 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3668 {
3669         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3670         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679         int x, startx = span->startx, endx = span->endx;
3680         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3681         float LightVectordata[4];
3682         float LightVectorslope[4];
3683         float EyeVectordata[4];
3684         float EyeVectorslope[4];
3685         float VectorSdata[4];
3686         float VectorSslope[4];
3687         float VectorTdata[4];
3688         float VectorTslope[4];
3689         float VectorRdata[4];
3690         float VectorRslope[4];
3691         float z;
3692         float diffusetex[4];
3693         float glosstex[4];
3694         float surfacenormal[4];
3695         float lightnormal[4];
3696         float lightnormal_modelspace[4];
3697         float eyenormal[4];
3698         float specularnormal[4];
3699         float diffuse;
3700         float specular;
3701         float SpecularPower;
3702         int d[4];
3703         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3704         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3705         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3706         Color_Glow[3] = 0.0f;
3707         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3708         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3709         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3710         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3711         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3712         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3713         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3714         Color_Pants[3] = 0.0f;
3715         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3716         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3717         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3718         Color_Shirt[3] = 0.0f;
3719         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3720         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3721         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3722         {
3723                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3725         }
3726         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3727         {
3728                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729         }
3730         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3731         {
3732                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3733                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3734                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3735                 Color_Diffuse[3] = 0.0f;
3736                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3737                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3738                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3739                 LightColor[3] = 0.0f;
3740                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3741                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3742                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3743                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3744                 Color_Specular[3] = 0.0f;
3745                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3746                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3747                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3748
3749                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3750                 {
3751                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3752                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3753                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3754                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3756                 }
3757                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3758                 {
3759                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3761                 }
3762                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3763                 {
3764                         // nothing of this needed
3765                 }
3766                 else
3767                 {
3768                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3769                 }
3770
3771                 for (x = startx;x < endx;x++)
3772                 {
3773                         z = buffer_z[x];
3774                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3775                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3776                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3777                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3778                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3779                         {
3780                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3781                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3782                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3783                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3784                         }
3785                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3786                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3787                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3788                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3789                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3790                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3791                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3792                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3793
3794                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3795                         {
3796                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3797                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3800
3801                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3802                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3803                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3804                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3805
3806                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3807                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3808                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3809                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3810
3811                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3812                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3813                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3814                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3815
3816                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3817                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3818
3819                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3820                                 {
3821                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3822                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3823                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3824                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3825                                 }
3826                         }
3827                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3828                         {
3829                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3830                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3831                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3832                                 {
3833                                         float f = 1.0f / 256.0f;
3834                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3835                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3836                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3837                                 }
3838                         }
3839                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3840                         {
3841                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3842                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3843                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3844                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3845
3846                                 LightColor[0] = 1.0;
3847                                 LightColor[1] = 1.0;
3848                                 LightColor[2] = 1.0;
3849                         }
3850                         else
3851                         {
3852                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3853                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3854                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3855                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3856                         }
3857
3858                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3859
3860                         if(thread->shader_exactspecularmath)
3861                         {
3862                                 // reflect lightnormal at surfacenormal, take the negative of that
3863                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3864                                 float f;
3865                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3866                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3867                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3868                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3869
3870                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3871                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3872                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3873                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3874                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3875
3876                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3877                         }
3878                         else
3879                         {
3880                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3881                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3882                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3883                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3884
3885                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3886                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3887                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3888                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3889
3890                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3891                         }
3892                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3893
3894                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3895                         {
3896                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3897                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3898                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3899                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3900                         }
3901                         else
3902                         {
3903                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3904                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3905                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3906                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3907                         }
3908
3909                         buffer_FragColorbgra8[x*4+0] = d[0];
3910                         buffer_FragColorbgra8[x*4+1] = d[1];
3911                         buffer_FragColorbgra8[x*4+2] = d[2];
3912                         buffer_FragColorbgra8[x*4+3] = d[3];
3913                 }
3914         }
3915         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3916         {
3917                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3918                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3919                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3920                 Color_Diffuse[3] = 0.0f;
3921                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3922                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3923                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3924                 LightColor[3] = 0.0f;
3925                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926
3927                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3928                 {
3929                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3930                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3931                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3932                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3934                 }
3935                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3936                 {
3937                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3939                 }
3940                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3941                 {
3942                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3943                 }
3944                 else
3945                 {
3946                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3947                 }
3948
3949                 for (x = startx;x < endx;x++)
3950                 {
3951                         z = buffer_z[x];
3952                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3953                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3954                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3955                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3956                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3960
3961                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3962                         {
3963                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3964                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3965                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3966                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3967
3968                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3969                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3970                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3971                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3972
3973                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3974                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3975                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3976                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3977
3978                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3979                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3980                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3981                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3982
3983                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3984                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3985
3986                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3987                                 {
3988                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3989                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3990                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3991                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3992                                 }
3993                         }
3994                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3995                         {
3996                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3997                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3998                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3999                                 {
4000                                         float f = 1.0f / 256.0f;
4001                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4002                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4003                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4004                                 }
4005                         }
4006                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4007                         {
4008                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4009                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4010                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4011                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4012
4013                                 LightColor[0] = 1.0;
4014                                 LightColor[1] = 1.0;
4015                                 LightColor[2] = 1.0;
4016                         }
4017                         else
4018                         {
4019                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4020                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4021                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4022                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4023                         }
4024
4025                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4026                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4027                         {
4028                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4029                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4030                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4031                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4032                         }
4033                         else
4034                         {
4035                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4036                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4037                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4038                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4039                         }
4040                         buffer_FragColorbgra8[x*4+0] = d[0];
4041                         buffer_FragColorbgra8[x*4+1] = d[1];
4042                         buffer_FragColorbgra8[x*4+2] = d[2];
4043                         buffer_FragColorbgra8[x*4+3] = d[3];
4044                 }
4045         }
4046         else
4047         {
4048                 for (x = startx;x < endx;x++)
4049                 {
4050                         // z = buffer_z[x];
4051                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4052                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4053                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4054                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4055
4056                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4057                         {
4058                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4059                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4060                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4061                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4062                         }
4063                         else
4064                         {
4065                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4066                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4067                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4068                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4069                         }
4070                         buffer_FragColorbgra8[x*4+0] = d[0];
4071                         buffer_FragColorbgra8[x*4+1] = d[1];
4072                         buffer_FragColorbgra8[x*4+2] = d[2];
4073                         buffer_FragColorbgra8[x*4+3] = d[3];
4074                 }
4075         }
4076         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4077 }
4078
4079
4080
4081 static void DPSOFTRAST_VertexShader_LightSource(void)
4082 {
4083         int i;
4084         int numvertices = dpsoftrast.numvertices;
4085         float LightPosition[4];
4086         float LightVector[4];
4087         float LightVectorModelSpace[4];
4088         float EyePosition[4];
4089         float EyeVectorModelSpace[4];
4090         float EyeVector[4];
4091         float position[4];
4092         float svector[4];
4093         float tvector[4];
4094         float normal[4];
4095         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4096         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4097         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4098         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4099         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4100         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4101         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4102         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4103         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4104         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4105         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4106         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4107         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4108         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4109         for (i = 0;i < numvertices;i++)
4110         {
4111                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4112                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4113                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4114                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4115                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4116                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4117                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4118                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4119                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4120                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4121                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4122                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4123                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4124                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4125                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4126                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4127                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4128                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4132                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4133                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4134                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4135                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4136                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4137                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4138                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4139                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4140                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4141                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4142                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4143         }
4144         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4145         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4146 }
4147
4148 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4149 {
4150 #ifdef SSE_POSSIBLE
4151         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4152         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159         int x, startx = span->startx, endx = span->endx;
4160         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], /*Color_Glow[4],*/ Color_Pants[4], Color_Shirt[4], LightColor[4];
4161         float CubeVectordata[4];
4162         float CubeVectorslope[4];
4163         float LightVectordata[4];
4164         float LightVectorslope[4];
4165         float EyeVectordata[4];
4166         float EyeVectorslope[4];
4167         float z;
4168         float diffusetex[4];
4169         float glosstex[4];
4170         float surfacenormal[4];
4171         float lightnormal[4];
4172         float eyenormal[4];
4173         float specularnormal[4];
4174         float diffuse;
4175         float specular;
4176         float SpecularPower;
4177         float CubeVector[4];
4178         float attenuation;
4179         int d[4];
4180 #if 0
4181         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4182         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4183         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4184         Color_Glow[3] = 0.0f;
4185 #endif
4186         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4187         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4188         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4189         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4190         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4191         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4192         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4193         Color_Diffuse[3] = 0.0f;
4194         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4195         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4196         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4197         Color_Specular[3] = 0.0f;
4198         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4199         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4200         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4201         Color_Pants[3] = 0.0f;
4202         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4203         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4204         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4205         Color_Shirt[3] = 0.0f;
4206         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4207         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4208         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4209         LightColor[3] = 0.0f;
4210         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4211         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4212         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4213         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4214         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4215         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4216         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4217         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4218         {
4219                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4221         }
4222         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4223                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4224         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4225         {
4226                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4227                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4228                 for (x = startx;x < endx;x++)
4229                 {
4230                         z = buffer_z[x];
4231                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4232                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4233                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4234                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4235                         if (attenuation < 0.01f)
4236                                 continue;
4237                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4238                         {
4239                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4240                                 if (attenuation < 0.01f)
4241                                         continue;
4242                         }
4243
4244                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4245                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4246                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4247                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4248                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4249                         {
4250                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4251                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4252                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4253                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4254                         }
4255                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4256                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4257                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4258                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4259                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4260                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4261                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4262                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4263
4264                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4265                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4266                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4267                         DPSOFTRAST_Vector3Normalize(lightnormal);
4268
4269                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4270
4271                         if(thread->shader_exactspecularmath)
4272                         {
4273                                 // reflect lightnormal at surfacenormal, take the negative of that
4274                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4275                                 float f;
4276                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4277                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4278                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4279                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4280
4281                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4282                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4283                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4284                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4285                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4286
4287                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4288                         }
4289                         else
4290                         {
4291                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4292                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4293                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4294                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4295
4296                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4297                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4298                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4299                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4300
4301                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4302                         }
4303                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4304
4305                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4306                         {
4307                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4308                                 attenuation *= (1.0f / 255.0f);
4309                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4310                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4311                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4312                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4313                         }
4314                         else
4315                         {
4316                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4317                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4318                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4319                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4320                         }
4321                         buffer_FragColorbgra8[x*4+0] = d[0];
4322                         buffer_FragColorbgra8[x*4+1] = d[1];
4323                         buffer_FragColorbgra8[x*4+2] = d[2];
4324                         buffer_FragColorbgra8[x*4+3] = d[3];
4325                 }
4326         }
4327         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4328         {
4329                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4330                 for (x = startx;x < endx;x++)
4331                 {
4332                         z = buffer_z[x];
4333                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4334                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4335                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4336                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4337                         if (attenuation < 0.01f)
4338                                 continue;
4339                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4340                         {
4341                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4342                                 if (attenuation < 0.01f)
4343                                         continue;
4344                         }
4345
4346                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4347                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4348                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4349                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4350                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4351                         {
4352                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4353                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4354                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4355                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4356                         }
4357                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4358                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4359                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4360                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4361
4362                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4363                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4364                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4365                         DPSOFTRAST_Vector3Normalize(lightnormal);
4366
4367                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4368                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4369                         {
4370                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4371                                 attenuation *= (1.0f / 255.0f);
4372                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4373                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4374                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4375                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4376                         }
4377                         else
4378                         {
4379                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4380                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4381                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4382                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4383                         }
4384                         buffer_FragColorbgra8[x*4+0] = d[0];
4385                         buffer_FragColorbgra8[x*4+1] = d[1];
4386                         buffer_FragColorbgra8[x*4+2] = d[2];
4387                         buffer_FragColorbgra8[x*4+3] = d[3];
4388                 }
4389         }
4390         else
4391         {
4392                 for (x = startx;x < endx;x++)
4393                 {
4394                         z = buffer_z[x];
4395                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4396                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4397                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4398                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4399                         if (attenuation < 0.01f)
4400                                 continue;
4401                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4402                         {
4403                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4404                                 if (attenuation < 0.01f)
4405                                         continue;
4406                         }
4407
4408                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4409                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4410                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4411                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4412                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4413                         {
4414                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4415                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4416                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4417                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4418                         }
4419                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4420                         {
4421                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4422                                 attenuation *= (1.0f / 255.0f);
4423                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4424                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4425                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4426                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4427                         }
4428                         else
4429                         {
4430                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4431                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4432                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4433                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4434                         }
4435                         buffer_FragColorbgra8[x*4+0] = d[0];
4436                         buffer_FragColorbgra8[x*4+1] = d[1];
4437                         buffer_FragColorbgra8[x*4+2] = d[2];
4438                         buffer_FragColorbgra8[x*4+3] = d[3];
4439                 }
4440         }
4441         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4442 #endif
4443 }
4444
4445
4446
4447 static void DPSOFTRAST_VertexShader_Refraction(void)
4448 {
4449         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4450         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4451         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4452 }
4453
4454 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4455 {
4456         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4457         //float z;
4458         int x, startx = span->startx, endx = span->endx;
4459
4460         // texture reads
4461         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4462         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4463
4464         // varyings
4465         float ModelViewProjectionPositiondata[4];
4466         float ModelViewProjectionPositionslope[4];
4467
4468         // uniforms
4469         float ScreenScaleRefractReflect[2];
4470         float ScreenCenterRefractReflect[2];
4471         float DistortScaleRefractReflect[2];
4472         float RefractColor[4];
4473
4474         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4475         if(!texture) return;
4476
4477         // read textures
4478         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4479         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4480
4481         // read varyings
4482         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4483
4484         // read uniforms
4485         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4486         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4487         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4488         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4489         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4490         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4491         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4492         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4493         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4494         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4495
4496         // do stuff
4497         for (x = startx;x < endx;x++)
4498         {
4499                 float SafeScreenTexCoord[2];
4500                 float ScreenTexCoord[2];
4501                 float v[3];
4502                 float iw;
4503                 unsigned char c[4];
4504
4505                 //z = buffer_z[x];
4506
4507                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4508                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4509
4510                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4511                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4512                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4513
4514                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4515                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4516                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4517                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4518                 DPSOFTRAST_Vector3Normalize(v);
4519                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4520                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4521
4522                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4523                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4524
4525                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4526                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4527                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4528                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4529         }
4530
4531         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4532 }
4533
4534
4535
4536 static void DPSOFTRAST_VertexShader_Water(void)
4537 {
4538         int i;
4539         int numvertices = dpsoftrast.numvertices;
4540         float EyePosition[4];
4541         float EyeVectorModelSpace[4];
4542         float EyeVector[4];
4543         float position[4];
4544         float svector[4];
4545         float tvector[4];
4546         float normal[4];
4547         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4548         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4549         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4550         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4551         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4552         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4553         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4554         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4555         for (i = 0;i < numvertices;i++)
4556         {
4557                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4558                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4559                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4560                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4561                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4562                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4563                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4564                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4565                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4566                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4567                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4568                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4569                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4570                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4571                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4572                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4573                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4574                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4575                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4576                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4577                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4578                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4579         }
4580         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4581         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4582         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4583 }
4584
4585
4586 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4587 {
4588         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4589         // float z;
4590         int x, startx = span->startx, endx = span->endx;
4591
4592         // texture reads
4593         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4594         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4595
4596         // varyings
4597         float ModelViewProjectionPositiondata[4];
4598         float ModelViewProjectionPositionslope[4];
4599         float EyeVectordata[4];
4600         float EyeVectorslope[4];
4601
4602         // uniforms
4603         float ScreenScaleRefractReflect[4];
4604         float ScreenCenterRefractReflect[4];
4605         float DistortScaleRefractReflect[4];
4606         float RefractColor[4];
4607         float ReflectColor[4];
4608         float ReflectFactor;
4609         float ReflectOffset;
4610
4611         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4612         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4613         if(!texture_refraction || !texture_reflection) return;
4614
4615         // read textures
4616         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4617         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4618
4619         // read varyings
4620         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4621         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4622
4623         // read uniforms
4624         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4625         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4626         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4627         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4628         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4629         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4630         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4631         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4632         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4633         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4634         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4635         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4636         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4637         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4638         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4639         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4640         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4641         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4642         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4643         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4644         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4645         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4646
4647         // do stuff
4648         for (x = startx;x < endx;x++)
4649         {
4650                 float SafeScreenTexCoord[4];
4651                 float ScreenTexCoord[4];
4652                 float v[3];
4653                 float iw;
4654                 unsigned char c1[4];
4655                 unsigned char c2[4];
4656                 float Fresnel;
4657
4658                 // z = buffer_z[x];
4659
4660                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4661                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4662
4663                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4664                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4665                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4666                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4667                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4668
4669                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4670                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4671                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4672                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4673                 DPSOFTRAST_Vector3Normalize(v);
4674                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4675                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4676                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4677                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4678
4679                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4680                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4681                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4682                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4683                 DPSOFTRAST_Vector3Normalize(v);
4684                 Fresnel = 1.0f - v[2];
4685                 Fresnel = min(1.0f, Fresnel);
4686                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4687
4688                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4689                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4690                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4691                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4692
4693                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4694                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4695                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4696                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4697         }
4698
4699         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4700 }
4701
4702
4703
// Vertex stage registered for the ShowDepth row of DPSOFTRAST_ShaderModeTable.
// Hands the POSITION array to DPSOFTRAST_Array_TransformProject as both of its
// array arguments, together with the uniform slot located at
// uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
// NOTE(review): presumably this transforms/projects the positions in place;
// the helper is defined elsewhere in the file - confirm there.
4704 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4705 {
4706         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4707 }
4708
// Span (pixel) stage registered for the ShowDepth row of DPSOFTRAST_ShaderModeTable.
// Placeholder implementation: it does not visualise depth. It begins the span,
// zero-fills the 4-bytes-per-pixel colour buffer for pixels
// [span->startx, span->endx) and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, passed through to the span helpers
//   triangle - triangle the span belongs to, passed through to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4709 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4710 {
4711         // TODO: IMPLEMENT
4712         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4713         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4714         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // only the bytes belonging to pixels startx..endx-1 are initialised (all channels 0)
4715         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4716         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4717 }
4718
4719
4720
// Vertex stage registered for the DeferredGeometry row of DPSOFTRAST_ShaderModeTable.
// Hands the POSITION array to DPSOFTRAST_Array_TransformProject as both of its
// array arguments, together with the uniform slot located at
// uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
// NOTE(review): presumably an in-place transform/projection of the positions;
// the helper is defined elsewhere in the file - confirm there.
4721 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4722 {
4723         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4724 }
4725
// Span (pixel) stage registered for the DeferredGeometry row of DPSOFTRAST_ShaderModeTable.
// Placeholder implementation: no deferred-geometry output is produced. It
// begins the span, zero-fills the 4-bytes-per-pixel colour buffer for pixels
// [span->startx, span->endx) and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, passed through to the span helpers
//   triangle - triangle the span belongs to, passed through to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4726 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4727 {
4728         // TODO: IMPLEMENT
4729         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4730         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4731         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // only the bytes belonging to pixels startx..endx-1 are initialised (all channels 0)
4732         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4733         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4734 }
4735
4736
4737
// Vertex stage registered for the DeferredLightSource row of DPSOFTRAST_ShaderModeTable.
// Hands the POSITION array to DPSOFTRAST_Array_TransformProject as both of its
// array arguments, together with the uniform slot located at
// uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
// NOTE(review): presumably an in-place transform/projection of the positions;
// the helper is defined elsewhere in the file - confirm there.
4738 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4739 {
4740         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4741 }
4742
// Span (pixel) stage registered for the DeferredLightSource row of DPSOFTRAST_ShaderModeTable.
// Placeholder implementation: no lighting is computed. It begins the span,
// zero-fills the 4-bytes-per-pixel colour buffer for pixels
// [span->startx, span->endx) and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, passed through to the span helpers
//   triangle - triangle the span belongs to, passed through to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4743 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4744 {
4745         // TODO: IMPLEMENT
4746         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4747         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4748         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // only the bytes belonging to pixels startx..endx-1 are initialised (all channels 0)
4749         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4750         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4751 }
4752
4753
4754
// Per-shader-mode description used by the software rasterizer; one entry per
// shader mode lives in DPSOFTRAST_ShaderModeTable, indexed by thread->shader_mode.
4755 typedef struct DPSOFTRAST_ShaderModeInfo_s
4756 {
4757         int lodarrayindex; // array index whose attribute deltas feed the mip edge scale during triangle setup
4758         void (*Vertex)(void); // vertex stage entry point for this mode
4759         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // pixel stage, invoked once per span by DPSOFTRAST_Draw_ProcessSpans
4760         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // attribute arrays to interpolate; readers stop at the first value >= DPSOFTRAST_ARRAY_TOTAL
4761         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // texture units to inspect for mip selection; readers stop at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4762 }
4763 DPSOFTRAST_ShaderModeInfo;
4764
// Shader mode dispatch table, indexed by thread->shader_mode.
// Each row is {lodarrayindex, Vertex, Span, arrays list, texunits list}.
// The lists are terminated with ~0, which stored in an unsigned char is a value
// that the readers treat as "end of list" (they stop at the first entry that is
// >= DPSOFTRAST_ARRAY_TOTAL / >= DPSOFTRAST_MAXTEXTUREUNITS respectively).
// NOTE(review): the row order presumably has to match the SHADERMODE_* enum
// declared elsewhere - confirm before inserting or reordering rows.
// NOTE(review): the last three rows (ShowDepth, DeferredGeometry,
// DeferredLightSource) give no texunits initialiser, so that member is
// zero-filled instead of ~0-terminated like the Depth_Or_Shadow row; a reader
// of texunits would then see unit 0 in every slot - confirm this is intended.
4765 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4766 {
4767         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4768         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4769         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4770         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4771         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4772         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4773         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4774         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4775         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4776         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4777         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4778         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4779         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4780         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4781         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4782         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4783         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4784         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4785 };
4786
// Builds the per-pixel pass/fail mask for one span and shrinks the span to the
// range that still contains passing pixels.
//   thread - supplies the depth test state (depthtest, fb_depthfunc) and the
//            scratch mask buffer thread->pixelmaskarray that is filled in
//   span   - read for y, x, startx, endx, depthbase, depthslope; on return
//            span->pixelmask points at the mask and span->startx/endx are the
//            (possibly narrowed) pixel range. startx >= endx afterwards means
//            no pixel passed.
// The span's depth at pixel x is depthbase + depthslope*x, accumulated in an
// unsigned int so the stepping wraps instead of overflowing.
4787 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4788 {
4789         int x;
4790         int startx;
4791         int endx;
4792         unsigned int *depthpixel;
4793         int depth;
4794         int depthslope;
4795         unsigned int d;
4796         unsigned char *pixelmask;
             // depthpixel[x] addresses the stored depth of span pixel x (x is relative to span->x);
             // it is only dereferenced below once fb_depthpixels is known to be non-NULL
4797         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4798         startx = span->startx;
4799         endx = span->endx;
4800         depth = span->depthbase;
4801         depthslope = span->depthslope;
4802         pixelmask = thread->pixelmaskarray;
4803         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4804         {
                     // each case compares the STORED depth (left operand) against the span's
                     // interpolated depth d (right operand); unknown functions behave like GL_ALWAYS
4805                 switch(thread->fb_depthfunc)
4806                 {
4807                 default:
4808                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4809                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4810                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4811                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4812                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4813                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4814                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4815                 }
                     // trim failing pixels off both ends; failing pixels in the interior stay
                     // in range and are skipped later via their zero mask byte
4816                 while (startx < endx && !pixelmask[startx])
4817                         startx++;
4818                 while (endx > startx && !pixelmask[endx-1])
4819                         endx--;
4820         }
4821         else
4822         {
4823                 // no depth testing means we're just dealing with color...
4824                 memset(pixelmask + startx, 1, endx - startx);
4825         }
4826         span->pixelmask = pixelmask;
4827         span->startx = startx;
4828         span->endx = endx;
4829 }
4830
// Stores the span's interpolated depth into the depth buffer for every pixel
// whose mask byte is still set.
//   thread - read for depthmask and depthtest; nothing is written unless both
//            are set and a depth buffer (dpsoftrast.fb_depthpixels) exists
//   span   - read for y, x, startx, endx, depthbase, depthslope and pixelmask
// The value written for pixel x is depthbase + depthslope*x, the same quantity
// DPSOFTRAST_Draw_DepthTest compares against.
// NOTE(review): d is a signed int here but an unsigned int in
// DPSOFTRAST_Draw_DepthTest; the repeated d += depthslope would be signed
// overflow if depth values can exceed INT_MAX - confirm the depth range.
4831 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4832 {
4833         int x, d, depth, depthslope, startx, endx;
4834         const unsigned char *pixelmask;
4835         unsigned int *depthpixel;
4836         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4837         {
4838                 depth = span->depthbase;
4839                 depthslope = span->depthslope;
4840                 pixelmask = span->pixelmask;
4841                 startx = span->startx;
4842                 endx = span->endx;
                     // depthpixel[x] addresses the stored depth of span pixel x (x is relative to span->x)
4843                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4844                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4845                         if (pixelmask[x])
4846                                 depthpixel[x] = d;
4847         }
4848 }
4849
// Flushes the spans queued on this thread: for each of the thread->numspans
// entries it runs the depth test, the current shader mode's Span function and
// the depth write, in that order, then resets thread->numspans to 0.
//   thread - per-thread state holding the spans[] and triangles[] queues, the
//            active shader_mode and the colour/depth write state
4850 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4851 {
4852         int i;
4853         DPSOFTRAST_State_Triangle *triangle;
4854         DPSOFTRAST_State_Span *span;
4855         for (i = 0; i < thread->numspans; i++)
4856         {
4857                 span = &thread->spans[i];
4858                 triangle = &thread->triangles[span->triangle];
4859                 DPSOFTRAST_Draw_DepthTest(thread, span);
                     // the depth test narrows startx/endx; an empty range means every pixel failed
4860                 if (span->startx >= span->endx)
4861                         continue;
4862                 // run pixel shader if appropriate
4863                 // do this before running depthmask code, to allow the pixelshader
4864                 // to clear pixelmask values for alpha testing
4865                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4866                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4867                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4868         }
4869         thread->numspans = 0;
4870 }
4871
// Declares the Draw command consumed by DPSOFTRAST_Interpret_Draw below.
// NOTE(review): DEFCOMMAND is defined elsewhere in this file; presumably it
// produces the DPSOFTRAST_Command_Draw struct containing the listed fields
// with 22 as the command's opcode - confirm against the macro definition.
// refcount is an ATOMIC_COUNTER that DPSOFTRAST_Interpret_Draw decrements
// before deciding whether to free command->arrays.
4872 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4873
4874 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4875 {
4876 #ifdef SSE_POSSIBLE
4877         int cullface = thread->cullface;
4878         int minx, maxx, miny, maxy;
4879         int miny1, maxy1, miny2, maxy2;
4880         __m128i fbmin, fbmax;
4881         __m128 viewportcenter, viewportscale;
4882         int firstvertex = command->firstvertex;
4883         int numvertices = command->numvertices;
4884         int numtriangles = command->numtriangles;
4885         const int *element3i = command->element3i;
4886         const unsigned short *element3s = command->element3s;
4887         int clipped = command->clipped;
4888         int i;
4889         int j;
4890         int k;
4891         int y;
4892         int e[3];
4893         __m128i screeny;
4894         int starty, endy, bandy;
4895         int numpoints;
4896         int clipcase;
4897         float clipdist[4];
4898         float clip0origin, clip0slope;
4899         int clip0dir;
4900         __m128 triangleedge1, triangleedge2, trianglenormal;
4901         __m128 clipfrac[3];
4902         __m128 screen[4];
4903         DPSOFTRAST_State_Triangle *triangle;
4904         DPSOFTRAST_Texture *texture;
4905         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4906         miny = thread->fb_scissor[1];
4907         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4908         miny1 = bound(miny, thread->miny1, maxy);
4909         maxy1 = bound(miny, thread->maxy1, maxy);
4910         miny2 = bound(miny, thread->miny2, maxy);
4911         maxy2 = bound(miny, thread->maxy2, maxy);
4912         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4913         {
4914                 if (!ATOMIC_DECREMENT(command->refcount))
4915                 {
4916                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4917                                 MM_FREE(command->arrays);
4918                 }
4919                 return;
4920         }
4921         minx = thread->fb_scissor[0];
4922         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4923         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4924         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4925         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4926         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4927         screen[3] = _mm_setzero_ps();
4928         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4929         for (i = 0;i < numtriangles;i++)
4930         {
4931                 const float *screencoord4f = command->arrays;
4932                 const float *arrays = screencoord4f + numvertices*4;
4933
4934                 // generate the 3 edges of this triangle
4935                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4936                 if (element3s)
4937                 {
4938                         e[0] = element3s[i*3+0] - firstvertex;
4939                         e[1] = element3s[i*3+1] - firstvertex;
4940                         e[2] = element3s[i*3+2] - firstvertex;
4941                 }
4942                 else if (element3i)
4943                 {
4944                         e[0] = element3i[i*3+0] - firstvertex;
4945                         e[1] = element3i[i*3+1] - firstvertex;
4946                         e[2] = element3i[i*3+2] - firstvertex;
4947                 }
4948                 else
4949                 {
4950                         e[0] = i*3+0;
4951                         e[1] = i*3+1;
4952                         e[2] = i*3+2;
4953                 }
4954
4955 #define SKIPBACKFACE \
4956                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4957                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4958                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4959                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4960                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4961                 switch(cullface) \
4962                 { \
4963                 case GL_BACK: \
4964                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4965                                 continue; \
4966                         break; \
4967                 case GL_FRONT: \
4968                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4969                                 continue; \
4970                         break; \
4971                 }
4972
4973 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4974                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4975                         { \
4976                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4977                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4978                         }
4979 #define CLIPPEDVERTEXCOPY(k,p1) \
4980                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4981
4982 #define GENATTRIBCOPY(attrib, p1) \
4983                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4984 #define GENATTRIBLERP(attrib, p1, p2) \
4985                 { \
4986                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4987                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4988                 }
4989 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4990                 switch(clipcase) \
4991                 { \
4992                 default: \
4993                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4994                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4995                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4996                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4997                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4998                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4999                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5000                 }
5001
5002                 if (! clipped)
5003                         goto notclipped;
5004
5005                 // calculate distance from nearplane
5006                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5007                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5008                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5009                 if (clipdist[0] >= 0.0f)
5010                 {
5011                         if (clipdist[1] >= 0.0f)
5012                         {
5013                                 if (clipdist[2] >= 0.0f)
5014                                 {
5015                                 notclipped:
5016                                         // triangle is entirely in front of nearplane
5017                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5018                                         SKIPBACKFACE;
5019                                         numpoints = 3;
5020                                         clipcase = 0;
5021                                 }
5022                                 else
5023                                 {
5024                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5025                                         SKIPBACKFACE;
5026                                         numpoints = 4;
5027                                         clipcase = 1;
5028                                 }
5029                         }
5030                         else
5031                         {
5032                                 if (clipdist[2] >= 0.0f)
5033                                 {
5034                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5035                                         SKIPBACKFACE;
5036                                         numpoints = 4;
5037                                         clipcase = 2;
5038                                 }
5039                                 else
5040                                 {
5041                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5042                                         SKIPBACKFACE;
5043                                         numpoints = 3;
5044                                         clipcase = 3;
5045                                 }
5046                         }
5047                 }
5048                 else if (clipdist[1] >= 0.0f)
5049                 {
5050                         if (clipdist[2] >= 0.0f)
5051                         {
5052                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5053                                 SKIPBACKFACE;
5054                                 numpoints = 4;
5055                                 clipcase = 4;
5056                         }
5057                         else
5058                         {
5059                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5060                                 SKIPBACKFACE;
5061                                 numpoints = 3;
5062                                 clipcase = 5;
5063                         }
5064                 }
5065                 else if (clipdist[2] >= 0.0f)
5066                 {
5067                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5068                         SKIPBACKFACE;
5069                         numpoints = 3;
5070                         clipcase = 6;
5071                 }
5072                 else continue; // triangle is entirely behind nearplane
5073
5074                 {
5075                         // calculate integer y coords for triangle points
5076                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5077                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5078                                         screenmin = _mm_min_epi16(screeni, screenir),
5079                                         screenmax = _mm_max_epi16(screeni, screenir);
5080                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5081                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5082                         screenmin = _mm_max_epi16(screenmin, fbmin);
5083                         screenmax = _mm_min_epi16(screenmax, fbmax);
5084                         // skip offscreen triangles
5085                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5086                                 continue;
5087                         starty = _mm_extract_epi16(screenmin, 1);
5088                         endy = _mm_extract_epi16(screenmax, 1)+1;
5089                         if (starty >= maxy1 && endy <= miny2)
5090                                 continue;
5091                         screeny = _mm_srai_epi32(screeni, 16);
5092                 }
5093
5094                 triangle = &thread->triangles[thread->numtriangles];
5095
5096                 // calculate attribute plans for triangle data...
5097                 // okay, this triangle is going to produce spans, we'd better project
5098                 // the interpolants now (this is what gives perspective texturing),
5099                 // this consists of simply multiplying all arrays by the W coord
5100                 // (which is basically 1/Z), which will be undone per-pixel
5101                 // (multiplying by Z again) to get the perspective-correct array
5102                 // values
5103                 {
5104                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5105                         __m128 mipedgescale, mipdensity;
5106                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5107                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5108                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5109                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5110                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5111                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5112                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5113                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5114                         attribedge1 = _mm_sub_ss(w0, w1);
5115                         attribedge2 = _mm_sub_ss(w2, w1);
5116                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5117                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5118                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5119                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5120                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5121                         _mm_store_ss(&triangle->w[0], attribxslope);
5122                         _mm_store_ss(&triangle->w[1], attribyslope);
5123                         _mm_store_ss(&triangle->w[2], attriborigin);
5124                         
5125                         clip0origin = 0;
5126                         clip0slope = 0;
5127                         clip0dir = 0;
5128                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5129                         {
5130                                 float cliporigin, clipxslope, clipyslope;
5131                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5132                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5133                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5134                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5135                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5136                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5137                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5138                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5139                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5140                                 if(clipxslope != 0)
5141                                 {
5142                                         clip0origin = -cliporigin/clipxslope;
5143                                         clip0slope = -clipyslope/clipxslope;
5144                                         clip0dir = clipxslope > 0 ? 1 : -1;
5145                                 }
5146                                 else if(clipyslope > 0)
5147                                 {
5148                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5149                                         clip0slope = dpsoftrast.fb_width;
5150                                         clip0dir = -1;
5151                                 }
5152                                 else if(clipyslope < 0)
5153                                 {
5154                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5155                                         clip0slope = -dpsoftrast.fb_width;
5156                                         clip0dir = -1;
5157                                 }
5158                                 else if(clip0origin < 0) continue;
5159                         }
5160
5161                         mipedgescale = _mm_setzero_ps();
5162                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5163                         {
5164                                 __m128 attrib0, attrib1, attrib2;
5165                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5166                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5167                                         break;
5168                                 arrays += numvertices*4;
5169                                 GENATTRIBS(attrib0, attrib1, attrib2);
5170                                 attriborigin = _mm_mul_ps(attrib1, w1);
5171                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5172                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5173                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5174                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5175                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5176                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5177                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5178                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5179                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5180                                 {
5181                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5182                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5183                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5184                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5185                                 }
5186                         }
5187
5188                         memset(triangle->mip, 0, sizeof(triangle->mip));
5189                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5190                         {
5191                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5192                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5193                                         break;
5194                                 texture = thread->texbound[texunit];
5195                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5196                                 {
5197                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5198                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5199                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5200                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5201                                         // this will be multiplied in the texturing routine by the texture resolution
5202                                         y = _mm_cvtss_si32(mipdensity);
5203                                         if (y > 0)
5204                                         {
5205                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5206                                                 if (y > texture->mipmaps - 1)
5207                                                         y = texture->mipmaps - 1;
5208                                                 triangle->mip[texunit] = y;
5209                                         }
5210                                 }
5211                         }
5212                 }
5213         
5214                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5215                 for (; y < bandy;)
5216                 {
5217                         __m128 xcoords, xslope;
5218                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5219                         int yccmask = _mm_movemask_epi8(ycc);
5220                         int edge0p, edge0n, edge1p, edge1n;
5221                         int nexty;
5222                         float w, wslope;
5223                         float clip0;
5224                         if (numpoints == 4)
5225                         {
5226                                 switch(yccmask)
5227                                 {
5228                                 default:
5229                                 case 0xFFFF: /*0000*/ y = endy; continue;
5230                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5231                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5232                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5233                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5234                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5235                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5236                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5237                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5238                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5239                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5240                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5241                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5242                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5243                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5244                                 case 0x0000: /*1111*/ y++; continue;
5245                                 }
5246                         }
5247                         else
5248                         {
5249                                 switch(yccmask)
5250                                 {
5251                                 default:
5252                                 case 0xFFFF: /*000*/ y = endy; continue;
5253                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5254                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5255                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5256                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5257                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5258                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5259                                 case 0x0000: /*111*/ y++; continue;
5260                                 }
5261                         }
5262                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5263                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5264                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5265                         nexty = _mm_extract_epi16(ycc, 0);
5266                         if (nexty >= bandy) nexty = bandy-1;
5267                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5268                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5269                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5270                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5271                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5272                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5273                         {
5274                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5275                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5276                         }
5277                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5278                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5279                         {
5280                                 int startx, endx, offset;
5281                                 startx = _mm_cvtss_si32(xcoords);
5282                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5283                                 if (startx < minx) startx = minx;
5284                                 if (endx > maxx) endx = maxx;
5285                                 if (startx >= endx) continue;
5286
5287                                 if (clip0dir)
5288                                 {
5289                                         if (clip0dir > 0)
5290                                         {
5291                                                 if (startx < clip0) 
5292                                                 {
5293                                                         if(endx <= clip0) continue;
5294                                                         startx = (int)clip0;
5295                                                 }
5296                                         }
5297                                         else if (endx > clip0) 
5298                                         {
5299                                                 if(startx >= clip0) continue;
5300                                                 endx = (int)clip0;
5301                                         }
5302                                 }
5303                                                 
5304                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5305                                 {
5306                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5307                                         span->triangle = thread->numtriangles;
5308                                         span->x = offset;
5309                                         span->y = y;
5310                                         span->startx = 0;
5311                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5312                                         if (span->startx >= span->endx)
5313                                                 continue;
5314                                         wslope = triangle->w[0];
5315                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5316                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5317                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5318                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5319                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5320                                 }
5321                         }
5322                 }
5323
5324                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5325                 {
5326                         DPSOFTRAST_Draw_ProcessSpans(thread);
5327                         thread->numtriangles = 0;
5328                 }
5329         }
5330
5331         if (!ATOMIC_DECREMENT(command->refcount))
5332         {
5333                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5334                         MM_FREE(command->arrays);
5335         }
5336
5337         if (thread->numspans > 0 || thread->numtriangles > 0)
5338         {
5339                 DPSOFTRAST_Draw_ProcessSpans(thread);
5340                 thread->numtriangles = 0;
5341         }
5342 #endif
5343 }
5344
5345 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5346 {
5347         int i;
5348         int j;
5349         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5350         int datasize = 2*numvertices*sizeof(float[4]);
5351         DPSOFTRAST_Command_Draw *command;
5352         unsigned char *data;
5353         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5354         {
5355                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5356                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5357                         break;
5358                 datasize += numvertices*sizeof(float[4]);
5359         }
5360         if (element3s)
5361                 datasize += numtriangles*sizeof(unsigned short[3]);
5362         else if (element3i)
5363                 datasize += numtriangles*sizeof(int[3]);
5364         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5365         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5366         {
5367                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5368                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5369         }
5370         else
5371         {
5372                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5373                 data = (unsigned char *)command + commandsize;
5374         }
5375         command->firstvertex = firstvertex;
5376         command->numvertices = numvertices;
5377         command->numtriangles = numtriangles;
5378         command->arrays = (float *)data;
5379         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5380         dpsoftrast.firstvertex = firstvertex;
5381         dpsoftrast.numvertices = numvertices;
5382         dpsoftrast.screencoord4f = (float *)data;
5383         data += numvertices*sizeof(float[4]);
5384         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5385         data += numvertices*sizeof(float[4]);
5386         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5387         {
5388                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5389                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5390                         break;
5391                 dpsoftrast.post_array4f[j] = (float *)data;
5392                 data += numvertices*sizeof(float[4]);
5393         }
5394         command->element3i = NULL;
5395         command->element3s = NULL;
5396         if (element3s)
5397         {
5398                 command->element3s = (unsigned short *)data;
5399                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5400         }
5401         else if (element3i)
5402         {
5403                 command->element3i = (int *)data;
5404                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5405         }
5406         return command;
5407 }
5408
5409 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5410 {
5411         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5412         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5413         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5414         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5415         if (command->starty >= command->endy)
5416         {
5417                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5418                         MM_FREE(command->arrays);
5419                 DPSOFTRAST_UndoCommand(command->commandsize);
5420                 return;
5421         }
5422         command->clipped = dpsoftrast.drawclipped;
5423         command->refcount = dpsoftrast.numthreads;
5424
5425         if (dpsoftrast.usethreads)
5426         {
5427                 int i;
5428                 DPSOFTRAST_Draw_SyncCommands();
5429                 for (i = 0; i < dpsoftrast.numthreads; i++)
5430                 {
5431                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5432                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5433                                 Thread_CondSignal(thread->drawcond);
5434                 }
5435         }
5436         else
5437         {
5438                 DPSOFTRAST_Draw_FlushThreads();
5439         }
5440 }
5441
5442 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5443 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5444 {
5445         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5446 }
5447 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5448 {
5449         DPSOFTRAST_Command_SetRenderTargets *command;
5450         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5451                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5452                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5453                 DPSOFTRAST_Flush();
5454         dpsoftrast.fb_width = width;
5455         dpsoftrast.fb_height = height;
5456         dpsoftrast.fb_depthpixels = depthpixels;
5457         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5458         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5459         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5460         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5461         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5462         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5463         command->width = width;
5464         command->height = height;
5465 }
5466  
5467 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5468 {
5469         int commandoffset = thread->commandoffset;
5470         while (commandoffset != endoffset)
5471         {
5472                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5473                 switch (command->opcode)
5474                 {
5475 #define INTERPCOMMAND(name) \
5476                 case DPSOFTRAST_OPCODE_##name : \
5477                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5478                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5479                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5480                                 commandoffset = 0; \
5481                         break;
5482                 INTERPCOMMAND(Viewport)
5483                 INTERPCOMMAND(ClearColor)
5484                 INTERPCOMMAND(ClearDepth)
5485                 INTERPCOMMAND(ColorMask)
5486                 INTERPCOMMAND(DepthTest)
5487                 INTERPCOMMAND(ScissorTest)
5488                 INTERPCOMMAND(Scissor)
5489                 INTERPCOMMAND(BlendFunc)
5490                 INTERPCOMMAND(BlendSubtract)
5491                 INTERPCOMMAND(DepthMask)
5492                 INTERPCOMMAND(DepthFunc)
5493                 INTERPCOMMAND(DepthRange)
5494                 INTERPCOMMAND(PolygonOffset)
5495                 INTERPCOMMAND(CullFace)
5496                 INTERPCOMMAND(SetTexture)
5497                 INTERPCOMMAND(SetShader)
5498                 INTERPCOMMAND(Uniform4f)
5499                 INTERPCOMMAND(UniformMatrix4f)
5500                 INTERPCOMMAND(Uniform1i)
5501                 INTERPCOMMAND(SetRenderTargets)
5502                 INTERPCOMMAND(ClipPlane)
5503
5504                 case DPSOFTRAST_OPCODE_Draw:
5505                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5506                         commandoffset += command->commandsize;
5507                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5508                                 commandoffset = 0;
5509                         thread->commandoffset = commandoffset;
5510                         break;
5511
5512                 case DPSOFTRAST_OPCODE_Reset:
5513                         commandoffset = 0;
5514                         break;
5515                 }
5516         }
5517         thread->commandoffset = commandoffset;
5518 }
5519
5520 static int DPSOFTRAST_Draw_Thread(void *data)
5521 {
5522         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5523         while(thread->index >= 0)
5524         {
5525                 if (thread->commandoffset != dpsoftrast.drawcommand)
5526                 {
5527                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5528                 }
5529                 else 
5530                 {
5531                         Thread_LockMutex(thread->drawmutex);
5532                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5533                         {
5534                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5535                                 thread->starving = true;
5536                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5537                                 thread->starving = false;
5538                         }
5539                         Thread_UnlockMutex(thread->drawmutex);
5540                 }
5541         }   
5542         return 0;
5543 }
5544
5545 static void DPSOFTRAST_Draw_FlushThreads(void)
5546 {
5547         DPSOFTRAST_State_Thread *thread;
5548         int i;
5549         DPSOFTRAST_Draw_SyncCommands();
5550         if (dpsoftrast.usethreads) 
5551         {
5552                 for (i = 0; i < dpsoftrast.numthreads; i++)
5553                 {
5554                         thread = &dpsoftrast.threads[i];
5555                         if (thread->commandoffset != dpsoftrast.drawcommand)
5556                         {
5557                                 Thread_LockMutex(thread->drawmutex);
5558                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5559                                         Thread_CondSignal(thread->drawcond);
5560                                 Thread_UnlockMutex(thread->drawmutex);
5561                         }
5562                 }
5563                 for (i = 0; i < dpsoftrast.numthreads; i++)
5564                 {
5565                         thread = &dpsoftrast.threads[i];
5566                         if (thread->commandoffset != dpsoftrast.drawcommand)
5567                         {
5568                                 Thread_LockMutex(thread->drawmutex);
5569                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5570                                 {
5571                                         thread->waiting = true;
5572                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5573                                         thread->waiting = false;
5574                                 }
5575                                 Thread_UnlockMutex(thread->drawmutex);
5576                         }
5577                 }
5578         }
5579         else
5580         {
5581                 for (i = 0; i < dpsoftrast.numthreads; i++)
5582                 {
5583                         thread = &dpsoftrast.threads[i];
5584                         if (thread->commandoffset != dpsoftrast.drawcommand)
5585                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5586                 }
5587         }
5588         dpsoftrast.commandpool.usedcommands = 0;
5589 }
5590
// Public entry point that forwards to DPSOFTRAST_Draw_FlushThreads(), which
// processes all queued commands and empties the command pool.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5595
// Currently identical to DPSOFTRAST_Flush(): it simply calls it.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5600
5601 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5602 {
5603         int i;
5604         union
5605         {
5606                 int i;
5607                 unsigned char b[4];
5608         }
5609         u;
5610         u.i = 1;
5611         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5612         dpsoftrast.bigendian = u.b[3];
5613         dpsoftrast.fb_width = width;
5614         dpsoftrast.fb_height = height;
5615         dpsoftrast.fb_depthpixels = depthpixels;
5616         dpsoftrast.fb_colorpixels[0] = colorpixels;
5617         dpsoftrast.fb_colorpixels[1] = NULL;
5618         dpsoftrast.fb_colorpixels[1] = NULL;
5619         dpsoftrast.fb_colorpixels[1] = NULL;
5620         dpsoftrast.viewport[0] = 0;
5621         dpsoftrast.viewport[1] = 0;
5622         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5623         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5624         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5625         dpsoftrast.texture_firstfree = 1;
5626         dpsoftrast.texture_end = 1;
5627         dpsoftrast.texture_max = 0;
5628         dpsoftrast.color[0] = 1;
5629         dpsoftrast.color[1] = 1;
5630         dpsoftrast.color[2] = 1;
5631         dpsoftrast.color[3] = 1;
5632         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5633         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5634         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5635         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5636         for (i = 0; i < dpsoftrast.numthreads; i++)
5637         {
5638                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5639                 thread->index = i;
5640                 thread->cullface = GL_BACK;
5641         thread->colormask[0] = 1; 
5642                 thread->colormask[1] = 1;
5643                 thread->colormask[2] = 1;
5644                 thread->colormask[3] = 1;
5645                 thread->blendfunc[0] = GL_ONE;
5646                 thread->blendfunc[1] = GL_ZERO;
5647                 thread->depthmask = true;
5648                 thread->depthtest = true;
5649                 thread->depthfunc = GL_LEQUAL;
5650                 thread->scissortest = false;
5651                 thread->viewport[0] = 0;
5652                 thread->viewport[1] = 0;
5653                 thread->viewport[2] = dpsoftrast.fb_width;
5654                 thread->viewport[3] = dpsoftrast.fb_height;
5655                 thread->scissor[0] = 0;
5656                 thread->scissor[1] = 0;
5657                 thread->scissor[2] = dpsoftrast.fb_width;
5658                 thread->scissor[3] = dpsoftrast.fb_height;
5659                 thread->depthrange[0] = 0;
5660                 thread->depthrange[1] = 1;
5661                 thread->polygonoffset[0] = 0;
5662                 thread->polygonoffset[1] = 0;
5663                 thread->clipplane[0] = 0;
5664                 thread->clipplane[1] = 0;
5665                 thread->clipplane[2] = 0;
5666                 thread->clipplane[3] = 1;
5667         
5668                 thread->numspans = 0;
5669                 thread->numtriangles = 0;
5670                 thread->commandoffset = 0;
5671                 thread->waiting = false;
5672                 thread->starving = false;
5673            
5674                 thread->validate = -1;
5675                 DPSOFTRAST_Validate(thread, -1);
5676  
5677                 if (dpsoftrast.usethreads)
5678                 {
5679                         thread->waitcond = Thread_CreateCond();
5680                         thread->drawcond = Thread_CreateCond();
5681                         thread->drawmutex = Thread_CreateMutex();
5682                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5683                 }
5684         }
5685         return 0;
5686 }
5687
5688 void DPSOFTRAST_Shutdown(void)
5689 {
5690         int i;
5691         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5692         {
5693                 DPSOFTRAST_State_Thread *thread;
5694                 for (i = 0; i < dpsoftrast.numthreads; i++)
5695                 {
5696                         thread = &dpsoftrast.threads[i];
5697                         Thread_LockMutex(thread->drawmutex);
5698                         thread->index = -1;
5699                         Thread_CondSignal(thread->drawcond);
5700                         Thread_UnlockMutex(thread->drawmutex);
5701                         Thread_WaitThread(thread->thread, 0);
5702                         Thread_DestroyCond(thread->waitcond);
5703                         Thread_DestroyCond(thread->drawcond);
5704                         Thread_DestroyMutex(thread->drawmutex);
5705                 }
5706         }
5707         for (i = 0;i < dpsoftrast.texture_end;i++)
5708                 if (dpsoftrast.texture[i].bytes)
5709                         MM_FREE(dpsoftrast.texture[i].bytes);
5710         if (dpsoftrast.texture)
5711                 free(dpsoftrast.texture);
5712         if (dpsoftrast.threads)
5713                 MM_FREE(dpsoftrast.threads);
5714         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5715 }
5716