3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Per-compiler definitions of the alignment and atomic-counter helpers used by this file:
//   ALIGN(var)      - declare var with 16-byte alignment
//   ATOMIC(var)     - declare var with 4-byte alignment
//   MEMORY_BARRIER  - expands to _mm_sfence() in every branch visible here
//   ATOMIC_COUNTER  - the integer type used for shared counters
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - counter updates
// NOTE(review): several short directives of this conditional chain are not visible in this excerpt.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows (e.g. mingw): GCC attribute syntax combined with the Win32 Interlocked* functions.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Any other GCC-compatible compiler: __sync builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// Microsoft compiler: __declspec alignment and Interlocked* functions.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions. These are plain, non-atomic operations with no alignment request.
// NOTE(review): the guards for ALIGN, ATOMIC and ATOMIC_ADD are not visible here; presumably
// each is wrapped in #ifndef like the ones shown - confirm.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Fallback definition of _mm_cvtss_f32 for GCC releases older than 4.6 (not clang):
// extracts lane 0 of a 4-float vector through the vector-extract builtin.
// The version test compares major and minor together, so a newer major version
// with a small minor number (for example 5.1) does not take the fallback.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Release function matching the _mm_malloc-based MM_MALLOC / MM_CALLOC.
103 #define MM_FREE _mm_free
// C library variants of MM_MALLOC and MM_CALLOC.
// NOTE(review): the directives selecting between the two sets are not visible in this excerpt.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays; it is used as an array bound
// (for example by the triangle attribs table and post_array4f).
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One texture slot. Only some of the members are visible in this excerpt.
126 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds of this texture by queued draw work - confirm
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks the slot as free
137 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size of the level, [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are sized in multiples of COMMAND_SIZE (ALIGN_SIZE) and their structs are declared with ALIGN().
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: the opcode followed by the command's size in bytes.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand at the point where it wraps around the pool.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant DPSOFTRAST_OPCODE_<name>
// and the struct type DPSOFTRAST_Command_<name>, which starts with the same
// opcode/commandsize header as DPSOFTRAST_Command.
// NOTE(review): the macro lines that open the struct body and paste `fields` are not visible here.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB).
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound on a single command's size; no use is visible in this excerpt.
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer. DPSOFTRAST_AllocateCommand hands out space from `commands` and
// tracks it through the freecommand and usedcommands members (not visible here).
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span shaders.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// For each attribute array, three 4-float rows. The CALCATTRIB macros evaluate
// row[2] + x*row[0] + y*row[1], so row 0 is the per-x slope, row 1 the per-y slope and row 2 the base value.
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) - SSE form.
// Loads attribs[arrayindex][0] into slope and sets, for all four components at once,
//   data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// data and slope must be __m128 lvalues; the loads are aligned loads (_mm_load_ps).
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
		_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) - scalar form of the same
// computation; data and slope are arrays of four floats.
// The span argument is parenthesized before '->' so that any pointer expression
// (for example `spans + i`) can be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): no use of this constant is visible in this excerpt.
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread span and triangle arrays and of the pixel mask buffer.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bit flags stored in a thread's `validate` member; a set bit means the matching
// derived state must be recomputed by DPSOFTRAST_Validate.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three flags combined.
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes implemented by the rasterizer. DPSOFTRAST_RecalcBlendFunc selects one
// from the current source/destination blend factors and the blend-subtract flag.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread. The DPSOFTRAST_Interpret_* functions update it
// from queued commands. Many members are not visible in this excerpt.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane rescaled by DPSOFTRAST_RecalcClipPlane using the viewport scale and center
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it against dpsoftrast.drawcommand
289 ATOMIC(volatile int commandoffset);
// flags used in the wait/signal handshake in DPSOFTRAST_Draw_FreeCommandPool
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state, followed by the single instance `dpsoftrast`.
// Many members are not visible in this excerpt.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the `texture` array; DPSOFTRAST_Texture_Grow rebases them when the array moves
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// texture table: lowest index where the free-slot search in DPSOFTRAST_Texture_New starts, and the array itself
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
// message describing the most recent error
350 const char *errorstring;
// per-thread state array (numthreads entries)
355 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
363 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a float depth to the integer depth buffer format (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels into one 32-bit value laid out as 0xAARRGGBB.
// Each channel is converted with (int)(c * 255 + 0.5). Arguments are parenthesized so
// expressions may be passed, and the alpha byte is shifted as unsigned so that
// moving it into bit 31 is not a signed-overflow shift.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Convert a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated to int.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations; the definitions are not part of this excerpt.
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recompute the framebuffer-space scissor rectangle for this thread, then refresh the
// viewport transform, the clip plane and the thread's row ranges.
// NOTE(review): the local declarations and presumably the lower-bound clamps of x1/y1
// are not visible in this excerpt.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// the y range is flipped against the framebuffer height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off, the whole framebuffer is used
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport values, so it is recomputed after them
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Map the (source factor, destination factor) pair in thread->blendfunc to one of the
// DPSOFTRAST_BLENDMODE_* values and store it in thread->fb_blendmode.
// The two factors are combined into one switch key as (sfactor<<16)|dfactor.
// Any pair without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// blend-subtract enabled: only (GL_SRC_ALPHA, GL_ONE) has a dedicated mode
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// NOTE(review): presumably the else branch (regular blending) - the `else` line is not visible here
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to the same MUL mode
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Call DPSOFTRAST_Validate(thread, f) only when at least one flag in f is pending in
// thread->validate. The expression always evaluates to 0.
// `thread` is parenthesized so any pointer expression can be passed; it is evaluated
// more than once, so it must not have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recompute the derived state selected by `mask`, restricted to the flags currently
// pending in thread->validate. Each handled flag is cleared before its Recalc function runs.
// NOTE(review): lines between the mask computation and the first test are not visible
// (presumably an early return when nothing is pending).
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture slot by index. The slot is returned only when index is in
// [1, texture_end) and the slot has pixel storage allocated.
// NOTE(review): the fall-through return is not visible; callers test the result
// with `if (!texture)`, so it presumably returns NULL.
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarge the texture array and rebase every pointer into it (the global texbound
// table and each thread's texbound table) onto the new allocation.
502 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound-texture pointers can be converted to indices after the move
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
// NOTE(review): the line between these two assignments is not visible (presumably `else`):
// start at 1024 entries, otherwise double
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored directly and not checked; on failure the old
// array is lost and later accesses dereference NULL. Fixing this also needs the caller
// (DPSOFTRAST_Texture_New) to handle the failure.
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture of the given size and flags and allocate zero-filled pixel storage
// for all of its mip levels. On a rejected request dpsoftrast.errorstring is set to a
// message naming the problem.
// NOTE(review): the local declarations, the error returns, the mip-level loop header and
// the final return are not visible in this excerpt.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): this tests the product, so two negative dimensions pass here
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second, identical message is not visible here
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous check verbatim
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x has more than one bit set
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
602 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array if the chosen slot lies beyond its capacity
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per mip level: byte size of the level (4 bytes per texel, all sides), its byte offset
// (the running total `size`), and its dimensions
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// a 1x1x1 level, or a texture without the MIPMAP flag, ends the mip chain
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the visible lines
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release a texture's pixel storage and mark its slot as free. Does nothing if the
// index does not name a live texture.
// NOTE(review): lines between the lookup and the free call are not visible in this excerpt.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// zeroing the slot also clears bytes, which is what marks it free
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Rebuild mip levels 1..mipmaps-1 of a texture from the level above each, by averaging
// groups of parent texels per 4-byte pixel with rounding.
// NOTE(review): the computations of layer0/layer1 and row0/row1 and the branch selecting
// the 3D versus 2D paths are not visible in this excerpt.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to do for a texture with a single level
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the source rows in level i-1 for the
// (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1) combinations
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight parent texels per output texel: two adjacent pixels from each of the four source rows
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// NOTE(review): this loop advances only i0 and i1, not i2 and i3
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// four parent texels per output texel: two adjacent pixels from each of two rows
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight rectangle of 4-byte pixels from `pixels` (tightly
// packed rows) into the texture at (blockx, blocky), then rebuild the mip levels.
// NOTE(review): in the visible lines the destination is always computed from mip level 0
// and the `mip` parameter is not used; several lines of this function are not visible.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
751 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
// one source row per iteration; the destination advances by a full texture row
754 memcpy(dst, pixels, blockwidth * 4);
755 pixels += blockwidth * 4;
756 dst += texture->mipmap[0][2] * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole of mip level 0 with `pixels` (mipmap[0][1] bytes are copied),
// then rebuild the mip levels. Does nothing if the index does not name a live texture.
// NOTE(review): lines between the lookup and the copy are not visible in this excerpt.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770 DPSOFTRAST_Texture_CalculateMipmaps(index);
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
774 DPSOFTRAST_Texture *texture;
775 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776 return texture->mipmap[mip][2];
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
780 DPSOFTRAST_Texture *texture;
781 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782 return texture->mipmap[mip][3];
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
786 DPSOFTRAST_Texture *texture;
787 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788 return texture->mipmap[mip][4];
// Return a pointer to the first byte of mip level `mip` inside the texture's pixel
// storage, or 0 if the index does not name a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines; lines between the
// lookup and the return are not visible in this excerpt.
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
792 DPSOFTRAST_Texture *texture;
793 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
796 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture. A filter value above DPSOFTRAST_TEXTURE_FILTER_LINEAR
// on a texture created without the MIPMAP flag is reported through dpsoftrast.errorstring.
// NOTE(review): the lines after the error assignment (presumably an early return) and
// before the final store are not visible in this excerpt.
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
800 DPSOFTRAST_Texture *texture;
801 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
804 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
809 texture->filter = filter;
// Forward declaration; the definition is not part of this excerpt.
812 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publish the commands written so far: copy the pool's freecommand offset into
// dpsoftrast.drawcommand. When threads are in use, MEMORY_BARRIER is issued first so
// the command writes precede the offset update.
814 static void DPSOFTRAST_Draw_SyncCommands(void)
816 if(dpsoftrast.usethreads) MEMORY_BARRIER;
817 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Make room for `space` bytes in the command pool by recomputing how much of the pool is
// still in use by the threads and, if needed, waiting on the thread that holds the most.
// NOTE(review): the loop structure, the waitindex bookkeeping and the return statements
// are not visible in this excerpt.
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
822 DPSOFTRAST_State_Thread *thread;
824 int freecommand = dpsoftrast.commandpool.freecommand;
825 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
826 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
828 DPSOFTRAST_Draw_SyncCommands();
// find the largest distance from any thread's read offset to the write offset
834 for (i = 0; i < dpsoftrast.numthreads; i++)
836 thread = &dpsoftrast.threads[i];
837 commandoffset = freecommand - thread->commandoffset;
// the distance is taken modulo the pool size
838 if (commandoffset < 0)
839 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
840 if (commandoffset > usedcommands)
843 usedcommands = commandoffset;
846 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread under its draw mutex, unless it has already caught up
848 thread = &dpsoftrast.threads[waitindex];
849 Thread_LockMutex(thread->drawmutex);
850 if (thread->commandoffset != dpsoftrast.drawcommand)
852 thread->waiting = true;
// wake the thread if it flagged itself as starving, then sleep until it signals waitcond
853 if (thread->starving) Thread_CondSignal(thread->drawcond);
854 Thread_CondWait(thread->waitcond, thread->drawmutex);
855 thread->waiting = false;
857 Thread_UnlockMutex(thread->drawmutex);
859 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round `size` up to the next multiple of COMMAND_SIZE. The bit masking requires
// COMMAND_SIZE to be a power of two.
862 #define DPSOFTRAST_ALIGNCOMMAND(size) \
863 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocate a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>
// and its size rounded up by DPSOFTRAST_ALIGNCOMMAND; yields a pointer of that command type.
864 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
865 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a command with the given opcode and fill
// in its header (opcode and commandsize).
// NOTE(review): the lines that advance or reset freecommand and the return statement are
// not visible in this excerpt.
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
869 DPSOFTRAST_Command *command;
870 int freecommand = dpsoftrast.commandpool.freecommand;
871 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room is always kept for one extra command header
872 int extra = sizeof(DPSOFTRAST_Command);
// if the command does not fit before the end of the pool, the tail of the pool is counted as needed too
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for threads to consume commands (threaded) or flush
// (NOTE(review): the line between the two calls is not visible, presumably `else`)
875 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
877 if (dpsoftrast.usethreads)
878 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
880 DPSOFTRAST_Draw_FlushThreads();
// reload the pool counters after freeing space
881 freecommand = dpsoftrast.commandpool.freecommand;
882 usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around: mark the unused tail with a Reset command and account for the skipped bytes
884 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
886 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887 command->opcode = DPSOFTRAST_OPCODE_Reset;
888 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
891 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892 command->opcode = opcode;
893 command->commandsize = size;
895 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
897 dpsoftrast.commandpool.freecommand = freecommand;
898 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Give back the `size` bytes of the most recently allocated command by moving the pool
// counters back.
// NOTE(review): the lines that subtract from freecommand and test for wrap-around are
// not visible in this excerpt.
902 static void DPSOFTRAST_UndoCommand(int size)
904 int freecommand = dpsoftrast.commandpool.freecommand;
905 int usedcommands = dpsoftrast.commandpool.usedcommands;
908 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909 usedcommands -= size;
910 dpsoftrast.commandpool.freecommand = freecommand;
911 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): x, y, width, height.
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: store the viewport and mark the framebuffer-derived state as stale.
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
917 thread->viewport[0] = command->x;
918 thread->viewport[1] = command->y;
919 thread->viewport[2] = command->width;
920 thread->viewport[3] = command->height;
921 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queue the command and update the global viewport and its transform right away.
// NOTE(review): the assignments of command->x and command->y are not visible in this excerpt.
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
925 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
928 command->width = width;
929 command->height = height;
931 dpsoftrast.viewport[0] = x;
932 dpsoftrast.viewport[1] = y;
933 dpsoftrast.viewport[2] = width;
934 dpsoftrast.viewport[3] = height;
935 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): r, g, b, a as floats.
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fill the scissor rectangle, restricted to this thread's row ranges, with the
// packed clear color in every color buffer that is present.
// NOTE(review): several declarations, the empty-rectangle checks and the innermost
// statements are not visible in this excerpt.
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
941 int i, x1, y1, x2, y2, w, h, x, y;
942 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are current
946 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947 miny1 = thread->miny1;
948 maxy1 = thread->maxy1;
949 miny2 = thread->miny2;
950 maxy2 = thread->maxy2;
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 // FIXME: honor fb_colormask?
962 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963 for (i = 0;i < 4;i++)
// skip color buffers that are not present
965 if (!dpsoftrast.fb_colorpixels[i])
// walk the thread's first row band, then continue from the start of its second band
967 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
970 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971 for (x = x1;x < x2;x++)
// API side: queue a ClearColor command.
// NOTE(review): the assignments of the command fields are not visible in this excerpt.
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
978 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): depth as a float.
985 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fill the scissor rectangle, restricted to this thread's row ranges, in the
// depth buffer with the converted depth value.
// NOTE(review): several declarations, the empty-rectangle checks and the innermost store
// are not visible in this excerpt.
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
988 int x1, y1, x2, y2, w, h, x, y;
989 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are current
993 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994 miny1 = thread->miny1;
995 maxy1 = thread->maxy1;
996 miny2 = thread->miny2;
997 maxy2 = thread->maxy2;
998 x1 = thread->fb_scissor[0];
999 y1 = thread->fb_scissor[1];
1000 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
1002 if (y1 < miny1) y1 = miny1;
1003 if (y2 > maxy2) y2 = maxy2;
1008 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walk the thread's first row band, then continue from the start of its second band
1009 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010 for (;y < bandy;y++)
1012 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013 for (x = x1;x < x2;x++)
// API side: queue a ClearDepth command.
// NOTE(review): the assignment of command->depth is not visible in this excerpt.
1017 void DPSOFTRAST_ClearDepth(float d)
1019 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one enable flag per channel.
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: normalize each flag to 0 or 1 and build the matching 32-bit pixel mask.
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1026 thread->colormask[0] = command->r != 0;
1027 thread->colormask[1] = command->g != 0;
1028 thread->colormask[2] = command->b != 0;
1029 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag yields 0 or all bits set, which selects the channel's byte:
// red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000
1030 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queue a ColorMask command.
// NOTE(review): the assignments of the command fields are not visible in this excerpt.
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1034 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag.
1041 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: store the flag and mark the derived depth function as stale.
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1044 thread->depthtest = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queue a DepthTest command.
1047 void DPSOFTRAST_DepthTest(int enable)
1049 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
1053 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: store the flag and mark the framebuffer-derived state as stale.
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1056 thread->scissortest = command->enable;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queue a ScissorTest command.
1059 void DPSOFTRAST_ScissorTest(int enable)
1061 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062 command->enable = enable;
// Command 7 (Scissor): carries the scissor rectangle as x, y, width, height.
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Interpreter for Scissor: copies the rectangle into thread->scissor[0..3]
// (x, y, width, height in that order) and flags DPSOFTRAST_VALIDATE_FB.
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1068 thread->scissor[0] = command->x;
1069 thread->scissor[1] = command->y;
1070 thread->scissor[2] = command->width;
1071 thread->scissor[3] = command->height;
1072 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Scissor command and records the rectangle in it.
// Only the width/height stores are visible in this view; x/y are presumably
// stored on lines not shown - TODO confirm.
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1076 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1079 command->width = width;
1080 command->height = height;
// Command 8 (BlendFunc): carries the source and destination blend factors.
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Interpreter for BlendFunc: stores the factors in thread->blendfunc[0] (source)
// and [1] (destination) and flags the blend state for revalidation.
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1086 thread->blendfunc[0] = command->sfactor;
1087 thread->blendfunc[1] = command->dfactor;
1088 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and records both factors in it.
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1092 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093 command->sfactor = sfactor;
1094 command->dfactor = dfactor;
// Command 9 (BlendSubtract): carries a single enable flag.
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
// Interpreter for BlendSubtract: stores the flag in the thread state and flags
// the blend state (same validate bit as BlendFunc) for revalidation.
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1100 thread->blendsubtract = command->enable;
1101 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and records the enable flag in it.
1103 void DPSOFTRAST_BlendSubtract(int enable)
1105 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106 command->enable = enable;
// Command 10 (DepthMask): carries a single enable flag.
1109 DEFCOMMAND(10, DepthMask, int enable;)
// Interpreter for DepthMask: stores the flag in thread->depthmask.
// No validate bit is set in the visible code.
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1112 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and records the enable flag in it.
1114 void DPSOFTRAST_DepthMask(int enable)
1116 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117 command->enable = enable;
// Command 11 (DepthFunc): carries the depth comparison function selector.
1120 DEFCOMMAND(11, DepthFunc, int func;)
// Interpreter for DepthFunc: stores the selector in thread->depthfunc.
// NOTE(review): unlike Interpret_DepthTest, the visible code does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC. If a cached depth function is derived from
// thread->depthfunc during validation, it would not be refreshed by this
// command alone - confirm against the validation code.
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1123 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records the selector in it.
1125 void DPSOFTRAST_DepthFunc(int func)
1127 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128 command->func = func;
// Command 12 (DepthRange): carries the near and far depth range values.
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Interpreter for DepthRange: stores near in thread->depthrange[0] and far in [1].
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1134 thread->depthrange[0] = command->nearval;
1135 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records near/far in it.
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1139 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140 command->nearval = nearval;
1141 command->farval = farval;
// Command 13 (PolygonOffset): carries the two offset terms alongnormal and intoview.
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Interpreter for PolygonOffset: stores alongnormal in thread->polygonoffset[0]
// and intoview in [1].
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1147 thread->polygonoffset[0] = command->alongnormal;
1148 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both terms in it.
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1152 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153 command->alongnormal = alongnormal;
1154 command->intoview = intoview;
// Command 14 (CullFace): carries the face culling mode.
1157 DEFCOMMAND(14, CullFace, int mode;)
// Interpreter for CullFace: stores the mode in thread->cullface.
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1160 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the mode in it.
1162 void DPSOFTRAST_CullFace(int mode)
1164 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165 command->mode = mode;
// Sets the constant vertex color in dpsoftrast.color[0..3] (r, g, b, a).
// The value is written directly to the global state; the visible code allocates
// no command. DPSOFTRAST_Array_Load uses this color to fill the color array
// when neither a float nor a byte color pointer is set.
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1170 dpsoftrast.color[0] = r;
1171 dpsoftrast.color[1] = g;
1172 dpsoftrast.color[2] = b;
1173 dpsoftrast.color[3] = a;
// Reads back a rectangle of color buffer 0 (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky, blockwidth, blockheight: requested rectangle in pixels
//   outpixels: destination, written with a row stride of blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer size. Source rows are addressed as
// (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped relative
// to the output row order. dpsoftrast.bigendian selects a separate per-pixel
// loop; the per-pixel bodies of both paths are not visible in this view.
// NOTE(review): output rows are indexed by (y - by1) with the clamped by1, and
// no horizontal output offset appears in the visible lines, so a rectangle that
// was clamped on its low edges lands at the output origin rather than at its
// requested offset - confirm this is intended.
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1178 int outstride = blockwidth * 4;
1179 int instride = dpsoftrast.fb_width * 4;
1182 int bx2 = blockx + blockwidth;
1183 int by2 = blocky + blockheight;
1187 unsigned char *inpixels;
1191 if (bx1 < 0) bx1 = 0;
1192 if (by1 < 0) by1 = 0;
1193 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1196 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197 if (dpsoftrast.bigendian)
1199 for (y = by1;y < by2;y++)
1201 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202 o = (unsigned char *)outpixels + (y - by1) * outstride;
1203 for (x = bx1;x < bx2;x++)
1216 for (y = by1;y < by2;y++)
1218 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 into one mip level of a texture.
//   index: texture handle, mip: mip level (must be within texture->mipmaps)
//   tx, ty: destination origin in the texture; sx, sy: source origin in the framebuffer
// Returns silently if the texture handle or mip level is invalid, or if the
// clamped rectangle is empty. Both rectangles are clamped to their surfaces and
// the copy size is the smaller of the two (tw/th are limited to sw/sh).
// The source is read bottom-up (sy1 is flipped to sheight - 1 - sy1 and then
// decremented per row) while destination rows increase, so the image is flipped
// vertically during the copy. Each row is copied as tw 4-byte pixels.
// If the texture has more than one mip level, its mipmaps are recalculated afterwards.
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1229 int tx2 = tx + width;
1230 int ty2 = ty + height;
1233 int sx2 = sx + width;
1234 int sy2 = sy + height;
1244 unsigned int *spixels;
1245 unsigned int *tpixels;
1246 DPSOFTRAST_Texture *texture;
1247 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248 if (mip < 0 || mip >= texture->mipmaps) return;
1250 spixels = dpsoftrast.fb_colorpixels[0];
1251 swidth = dpsoftrast.fb_width;
1252 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3] as width and height
1253 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254 twidth = texture->mipmap[mip][2];
1255 theight = texture->mipmap[mip][3];
1256 if (tx1 < 0) tx1 = 0;
1257 if (ty1 < 0) ty1 = 0;
1258 if (tx2 > twidth) tx2 = twidth;
1259 if (ty2 > theight) ty2 = theight;
1260 if (sx1 < 0) sx1 = 0;
1261 if (sy1 < 0) sy1 = 0;
1262 if (sx2 > swidth) sx2 = swidth;
1263 if (sy2 > sheight) sy2 = sheight;
1268 if (tw > sw) tw = sw;
1269 if (th > sh) th = sh;
1270 if (tw < 1 || th < 1)
1272 sy1 = sheight - 1 - sy1;
1273 for (y = 0;y < th;y++)
1274 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275 if (texture->mipmaps > 1)
1276 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): carries the texture unit number and the texture to bind
// (the issuing function allows a NULL texture when the handle is 0).
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Interpreter for SetTexture: releases this thread's bind on the texture
// previously bound to the unit (atomic decrement of its binds counter, skipped
// when nothing was bound), then installs the new texture pointer.
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1282 if (thread->texbound[command->unitnum])
1283 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284 thread->texbound[command->unitnum] = command->texture;
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1288 DPSOFTRAST_Command_SetTexture *command;
1289 DPSOFTRAST_Texture *texture;
1290 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1292 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1295 texture = DPSOFTRAST_Texture_GetByIndex(index);
1296 if (index && !texture)
1298 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1302 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303 command->unitnum = unitnum;
1304 command->texture = texture;
1306 dpsoftrast.texbound[unitnum] = texture;
1307 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// DPSOFTRAST_Array_Load uses the stride as a byte step between vertices.
1310 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1312 dpsoftrast.pointer_vertex3f = vertex3f;
1313 dpsoftrast.stride_vertex = stride;
// Selects a float (4 floats per vertex) color array and clears the byte color
// pointer, so only one of the two color sources is active at a time.
1315 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1317 dpsoftrast.pointer_color4f = color4f;
1318 dpsoftrast.pointer_color4ub = NULL;
1319 dpsoftrast.stride_color = stride;
// Selects a byte (4 unsigned chars per vertex) color array and clears the float
// color pointer, so only one of the two color sources is active at a time.
1321 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1323 dpsoftrast.pointer_color4f = NULL;
1324 dpsoftrast.pointer_color4ub = color4ub;
1325 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and stride for one
// texcoord slot. DPSOFTRAST_Array_Load dispatches on the component count to
// pick the matching loader.
// NOTE(review): unitnum is used as an array index without a range check in the
// visible code - callers presumably guarantee it is valid; confirm.
1327 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1329 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1330 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1331 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): carries the shader mode, permutation bits and the
// exactspecularmath flag.
1334 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Interpreter for SetShader: copies the three values into the thread state.
1335 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1337 thread->shader_mode = command->mode;
1338 thread->shader_permutation = command->permutation;
1339 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: allocates a SetShader command carrying the three values
// and also mirrors them into the global dpsoftrast state.
1341 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1343 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1344 command->mode = mode;
1345 command->permutation = permutation;
1346 command->exactspecularmath = exactspecularmath;
1348 dpsoftrast.shader_mode = mode;
1349 dpsoftrast.shader_permutation = permutation;
1350 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f): carries a uniform slot index and four float values.
1353 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Interpreter for Uniform4f: copies the four floats into the thread's uniform
// storage; each uniform slot occupies four consecutive floats (index*4).
1354 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1356 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: sets uniform slot `index` to (v0, v1, v2, v3).
// Allocates a Uniform4f command carrying the values and also mirrors them into
// the global dpsoftrast.uniform4f storage at index*4.
1358 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1360 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1361 command->index = index;
1362 command->val[0] = v0;
1363 command->val[1] = v1;
1364 command->val[2] = v2;
1365 command->val[3] = v3;
1367 dpsoftrast.uniform4f[index*4+0] = v0;
1368 dpsoftrast.uniform4f[index*4+1] = v1;
1369 dpsoftrast.uniform4f[index*4+2] = v2;
1370 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: v must point to at least four floats.
// Allocates a Uniform4f command carrying a copy of the values and mirrors them
// into the global dpsoftrast.uniform4f storage at index*4.
1372 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1374 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1375 command->index = index;
1376 memcpy(command->val, v, sizeof(command->val));
1378 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): carries a uniform slot index and a 16-float
// matrix; the matrix member is declared through ALIGN() so SSE aligned stores
// can target it.
1381 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Interpreter for UniformMatrix4f: copies all 16 floats into the thread's
// uniform storage starting at index*4, so a matrix spans four consecutive
// 4-float uniform slots.
1382 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1384 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at uniform slot `uniform`.
//   v: source matrices, 16 floats each; may be unaligned (an unaligned-load
//      path is chosen when v is not ALIGN_SIZE-aligned)
//   transpose: the visible code contains a 4x4 transpose step; the condition
//      guarding it is not shown in this view - presumably this flag (TODO confirm)
// One UniformMatrix4f command is allocated per matrix; each matrix advances the
// slot index by 4 and the source pointer by 16 floats. The result is also
// mirrored into the global dpsoftrast.uniform4f storage using aligned stores,
// so that storage (and command->val) must be 16-byte aligned.
1386 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1390 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1392 __m128 m0, m1, m2, m3;
1393 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1394 command->index = (DPSOFTRAST_UNIFORM)index;
1395 if (((size_t)v)&(ALIGN_SIZE-1))
1397 m0 = _mm_loadu_ps(v);
1398 m1 = _mm_loadu_ps(v+4);
1399 m2 = _mm_loadu_ps(v+8);
1400 m3 = _mm_loadu_ps(v+12);
1404 m0 = _mm_load_ps(v);
1405 m1 = _mm_load_ps(v+4);
1406 m2 = _mm_load_ps(v+8);
1407 m3 = _mm_load_ps(v+12);
1411 __m128 t0, t1, t2, t3;
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1412 t0 = _mm_unpacklo_ps(m0, m1);
1413 t1 = _mm_unpacklo_ps(m2, m3);
1414 t2 = _mm_unpackhi_ps(m0, m1);
1415 t3 = _mm_unpackhi_ps(m2, m3);
1416 m0 = _mm_movelh_ps(t0, t1);
1417 m1 = _mm_movehl_ps(t1, t0);
1418 m2 = _mm_movelh_ps(t2, t3);
1419 m3 = _mm_movehl_ps(t3, t2);
1421 _mm_store_ps(command->val, m0);
1422 _mm_store_ps(command->val+4, m1);
1423 _mm_store_ps(command->val+8, m2);
1424 _mm_store_ps(command->val+12, m3);
1425 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1426 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1427 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1428 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): carries a uniform slot index and one integer value.
1433 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Interpreter for Uniform1i: stores the value in the thread's integer uniform table.
1434 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1436 thread->uniform1i[command->index] = command->val;
// Public entry point: sets integer uniform `index` to i0.
// Allocates a Uniform1i command, records the index and mirrors the value into
// the global dpsoftrast.uniform1i table. The store of i0 into the command is
// presumably on a line not shown in this view - TODO confirm.
1438 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1440 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1441 command->index = index;
1444 dpsoftrast.uniform1i[command->index] = i0;
// Command 24 (ClipPlane): carries the four plane coefficients.
1447 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Interpreter for ClipPlane: copies the four coefficients into thread->clipplane
// and flags DPSOFTRAST_VALIDATE_FB for revalidation.
1448 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1450 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1451 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ClipPlane command and records the plane
// coefficients (x, y, z, w) in command->clipplane[0..3].
1453 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1455 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1456 command->clipplane[0] = x;
1457 command->clipplane[1] = y;
1458 command->clipplane[2] = z;
1459 command->clipplane[3] = w;
// Copies `size` vertices of 4 floats each from a strided byte source into dst.
//   dst: written with aligned stores (_mm_store_ps), so it must be 16-byte aligned
//   src/stride: if either the source address or the stride is not a multiple of
//     ALIGN_SIZE, the unaligned-load path is used; otherwise aligned loads
// The loop headers and pointer increments are not visible in this view;
// presumably dst advances 4 floats and src advances `stride` bytes per vertex
// until dst reaches `end` - TODO confirm.
1463 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1465 float *end = dst + size*4;
1466 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1470 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1479 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` vertices of 3 floats each into 4-float vertices (x, y, z, 1.0) in dst.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src/stride: strided byte source
// Tightly packed input (stride == 12 bytes) is handled four vertices at a time:
// three 16-byte loads cover exactly four xyz triples, which are re-split with
// shuffles; `end4` bounds the part of dst that is a whole multiple of four
// vertices. Other strides, and presumably the leftover vertices of the packed
// case (loop headers are not visible in this view), go through a per-vertex
// path that loads 16 bytes and overwrites the fourth lane with 1.0.
// The recurring idiom rotates the vector so the unwanted lane sits in lane 0,
// replaces lane 0 with 1.0 via _mm_move_ss, and rotates back.
// NOTE(review): the per-vertex path loads four floats from a three-float
// vertex, i.e. it reads the float following z; for the last vertex of a tightly
// sized array that is a 4-byte read past the data - confirm callers pad.
1486 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1488 float *end = dst + size*4;
1489 if (stride == sizeof(float[3]))
1491 float *end4 = dst + (size&~3)*4;
1492 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1496 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
// vertex 0: rotate v1 so x1 is in lane 0, replace it with 1.0, rotate back -> x0 y0 z0 1
1497 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1: gather (x1 x1 y1 z1) from v1/v2, then the same 1.0-and-rotate step
1500 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1501 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2: gather (x2 y2 z2 z2) from v2/v3, rotate to (z2 x2 y2 z2), then 1.0-and-rotate
1503 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1504 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1505 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3: v3 already holds (z2 x3 y3 z3); replace z2 with 1.0 and rotate
1507 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510 src += 4*sizeof(float[3]);
1517 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
// aligned variant of the same four-vertex split as above
1518 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1519 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1520 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1522 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1525 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1526 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531 src += 4*sizeof(float[3]);
1535 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1539 __m128 v = _mm_loadu_ps((const float *)src);
// per-vertex path: lane 3 (whatever follows z in memory) is replaced by 1.0
1540 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1541 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1542 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1543 _mm_store_ps(dst, v);
1552 __m128 v = _mm_load_ps((const float *)src);
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556 _mm_store_ps(dst, v);
// Expands `size` vertices of 2 floats each into 4-float vertices (x, y, 0.0, 1.0) in dst.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src/stride: strided byte source
// v2 = (0, 0, 0, 1) supplies the constant upper half. Tightly packed input
// (stride == 8 bytes) is handled two vertices per 16-byte load, bounded by
// `end2` (the even part of the count). The single-vertex form loads 8 bytes
// into the low half of v2 with _mm_loadl_pi. Loop headers are not visible in
// this view.
1563 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1565 float *end = dst + size*4;
1566 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1567 if (stride == sizeof(float[2]))
1569 float *end2 = dst + (size&~1)*4;
1570 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1574 __m128 v = _mm_loadu_ps((const float *)src);
// v = x0 y0 x1 y1: first vertex takes v's low half, second takes v's high half,
// both completed with (0, 1) from v2's high half
1575 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1576 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1578 src += 2*sizeof(float[2]);
1585 __m128 v = _mm_load_ps((const float *)src);
1586 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589 src += 2*sizeof(float[2]);
1595 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` vertices of 4 unsigned bytes each into 4-float vertices in
// dst, scaling each byte by 1/255 (so 255 maps to 1.0).
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src/stride: strided byte source
// Tightly packed input (stride == 4 bytes) is handled four vertices per 16-byte
// load, bounded by `end4`; bytes are widened to 32-bit integers by interleaving
// with zero twice (8->16->32 bits) and then converted to float. The
// single-vertex form reads one 32-bit word from src. Loop headers are not
// visible in this view.
1601 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1603 float *end = dst + size*4;
1604 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1605 if (stride == sizeof(unsigned char[4]))
1607 float *end4 = dst + (size&~3)*4;
1608 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1612 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// v1 holds vertices 0-1 and v2 vertices 2-3 as 16-bit values; widen each half to 32 bits
1613 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1614 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1615 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1616 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1618 src += 4*sizeof(unsigned char[4]);
1625 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631 src += 4*sizeof(unsigned char[4]);
1637 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
// single vertex: the four bytes of one 32-bit word are widened and scaled
1638 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with `size` copies of the 4-float value at src.
//   src: read once with an unaligned load, so it needs no particular alignment
//   dst: written with aligned stores, so it must be 16-byte aligned
// The loop around the store is not visible in this view; presumably it
// advances dst by 4 floats until it reaches `end` - TODO confirm.
1644 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1646 float *end = dst + 4*size;
1647 __m128 v = _mm_loadu_ps(src);
1650 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vertices from in4f into out4f by a 4x4 matrix.
//   inmatrix16f: 16 floats, loaded as four 4-float vectors m0..m3; need not be aligned
//   in4f: may be unaligned (separate unaligned-load path)
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. If the matrix compares
// bytewise equal to the identity, the transform is skipped and the data is
// only copied (and not even that when out4f == in4f).
// The store that consumes the sum and the loop headers are not visible in this
// view; presumably the result goes to out4f - TODO confirm.
1656 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1659 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1660 __m128 m0, m1, m2, m3;
1662 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1664 // fast case for identity matrix
1665 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1668 end = out4f + numitems*4;
1669 m0 = _mm_loadu_ps(inmatrix16f);
1670 m1 = _mm_loadu_ps(inmatrix16f + 4);
1671 m2 = _mm_loadu_ps(inmatrix16f + 8);
1672 m3 = _mm_loadu_ps(inmatrix16f + 12);
1673 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1677 __m128 v = _mm_loadu_ps(in4f);
1679 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1680 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1681 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1682 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1691 __m128 v = _mm_load_ps(in4f);
1693 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1695 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1696 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vertices from in4f to out4f with memcpy; the two
// ranges must therefore not overlap.
1705 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1707 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one clip-space vertex.
//   in:  __m128 (x, y, z, w); out: __m128 lvalue receiving the result
//   viewportcenter, viewportscale: __m128 values laid out to match the rotated
//     vector (1, x, y, z), i.e. element 0 pairs with the constant 1 and
//     elements 1..3 with x, y, z
// Steps: broadcast w; rotate the vertex to (1, x, y, z); compute
// center + scale * p / w per lane; rotate back so x, y, z are in lanes 0..2.
// Lane 3 of the result is center[0] + scale[0] / w (presumably 1/w with
// center[0] = 0 and scale[0] = 1 - TODO confirm against the viewport setup).
1712 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1714 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1715 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1716 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1717 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro with the same parameters as DPSOFTRAST_PROJECTVERTEX.
// The lines visible in this view are identical to that macro: broadcast w,
// rotate to (1, x, y, z), apply center + scale * p / w, rotate back.
// NOTE(review): no use of this macro appears in the visible part of the file;
// if the full bodies are indeed identical it may be redundant - confirm.
1720 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1722 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vertex by a matrix given as four __m128 vectors:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// `p` is a local of the macro whose declaration is not visible in this view
// (presumably initialized from `in` - TODO confirm). `out` must be an __m128 lvalue.
1728 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1731 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1732 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1734 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space vertical extent of an axis-aligned bounding box
// after transformation by inmatrix16f, and reports which box corners are clipped.
//   starty, endy: outputs, integer row bounds derived from the projected extent
//   minposf, maxposf: box corners, 4 floats each, loaded with aligned loads
//     (must be 16-byte aligned)
//   inmatrix16f: 16-float matrix, may be unaligned
// clipmask starts with all 8 corner bits set; a bit is cleared for every corner
// whose clip distance (z + w of the transformed corner) is >= 0. The final
// return statement is not visible in this view - presumably clipmask is
// returned (the callers store the result in dpsoftrast.drawclipped).
// The matrix vectors are swizzled so that lanes 0 and 1 are exchanged, which
// puts the transformed y into lane 0 where the scalar (_ss) min/max/divide
// operations act.
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1739 int clipmask = 0xFF;
1740 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1741 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1744 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, record its clip distance, and if it is
// not clipped clear its clipmask bit and fold its projected lane-0 value
// (divided by w) into minproj/maxproj
1748 #define BBFRONT(k, pos) \
1750 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1755 clipmask &= ~(1<<k); \
1756 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757 minproj = _mm_min_ss(minproj, proj); \
1758 maxproj = _mm_max_ss(maxproj, proj); \
1762 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1763 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1764 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1766 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1767 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1771 if (clipmask&(1<<k)) \
1773 if (!(clipmask&(1<<(k^1)))) \
1775 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778 minproj = _mm_min_ss(minproj, proj); \
1779 maxproj = _mm_max_ss(maxproj, proj); \
1781 if (!(clipmask&(1<<(k^2)))) \
1783 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786 minproj = _mm_min_ss(minproj, proj); \
1787 maxproj = _mm_max_ss(maxproj, proj); \
1789 if (!(clipmask&(1<<(k^4)))) \
1791 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794 minproj = _mm_min_ss(minproj, proj); \
1795 maxproj = _mm_max_ss(maxproj, proj); \
1799 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// for each clipped corner, BBCLIP interpolates along the three box edges
// (neighbors k^1, k^2, k^4) towards unclipped corners up to the point where the
// clip distance reaches zero, and folds those intersection points into the extent.
// Next: bring element 2 of the viewport vectors into lane 0; PROJECTVERTEX
// applies these vectors to (1, x, y, z), so element 2 is the y term
1800 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected extent to [-2, 2] before mapping it to rows
1802 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj, which presumably reflects a
// negative y viewport scale (screen rows growing downward) - TODO confirm
1806 *starty = _mm_cvttss_si32(maxproj);
1807 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed 4-float vertices to screen space.
//   out4f:    receives a copy of each input vertex (aligned stores)
//   screen4f: receives each vertex after DPSOFTRAST_PROJECTVERTEX (aligned stores)
//   starty, endy: outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f:     source vertices; may be unaligned (separate unaligned-load path)
// While iterating, the per-component min and max of the input positions are
// tracked (seeded from the first vertex). They are then stored to aligned
// temporaries and passed to DPSOFTRAST_Vertex_BoundY with an identity matrix,
// whose result is returned.
// Loop headers, pointer increments and any condition around the final BoundY
// call are not visible in this view.
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1813 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814 float *end = out4f + numitems*4;
1815 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816 __m128 minpos, maxpos;
1817 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1819 minpos = maxpos = _mm_loadu_ps(in4f);
1822 __m128 v = _mm_loadu_ps(in4f);
1823 minpos = _mm_min_ps(minpos, v);
1824 maxpos = _mm_max_ps(maxpos, v);
1825 _mm_store_ps(out4f, v);
1826 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827 _mm_store_ps(screen4f, v);
1835 minpos = maxpos = _mm_load_ps(in4f);
1838 __m128 v = _mm_load_ps(in4f);
1839 minpos = _mm_min_ps(minpos, v);
1840 maxpos = _mm_max_ps(maxpos, v);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
1851 ALIGN(float minposf[4]);
1852 ALIGN(float maxposf[4]);
1853 _mm_store_ps(minposf, minpos);
1854 _mm_store_ps(maxposf, maxpos);
1855 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` 4-float vertices by inmatrix16f and projects them to
// screen space in one pass.
//   out4f:    receives the transformed vertices (aligned stores)
//   screen4f: receives the transformed vertices after DPSOFTRAST_PROJECTVERTEX
//   starty, endy: outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f:     source vertices; may be unaligned (separate unaligned-load path)
//   inmatrix16f: 16-float matrix, may be unaligned
// If the matrix compares bytewise equal to the identity, the work is delegated
// to DPSOFTRAST_Vertex_Project. Otherwise the min/max of the untransformed
// input positions are tracked, and that bounding box together with the matrix
// is passed to DPSOFTRAST_Vertex_BoundY, whose result is returned.
// Loop headers, pointer increments and any condition around the final BoundY
// call are not visible in this view.
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1862 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1865 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867 end = out4f + numitems*4;
1868 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870 m0 = _mm_loadu_ps(inmatrix16f);
1871 m1 = _mm_loadu_ps(inmatrix16f + 4);
1872 m2 = _mm_loadu_ps(inmatrix16f + 8);
1873 m3 = _mm_loadu_ps(inmatrix16f + 12);
1874 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1876 minpos = maxpos = _mm_loadu_ps(in4f);
1879 __m128 v = _mm_loadu_ps(in4f);
1880 minpos = _mm_min_ps(minpos, v);
1881 maxpos = _mm_max_ps(maxpos, v);
1882 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883 _mm_store_ps(out4f, v);
1884 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885 _mm_store_ps(screen4f, v);
1893 minpos = maxpos = _mm_load_ps(in4f);
1896 __m128 v = _mm_load_ps(in4f);
1897 minpos = _mm_min_ps(minpos, v);
1898 maxpos = _mm_max_ps(maxpos, v);
1899 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900 _mm_store_ps(out4f, v);
1901 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902 _mm_store_ps(screen4f, v);
1910 ALIGN(float minposf[4]);
1911 ALIGN(float maxposf[4]);
1912 _mm_store_ps(minposf, minpos);
1913 _mm_store_ps(maxposf, maxpos);
1914 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex attribute array into the 4-float working array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   inarray selects the source:
//     DPSOFTRAST_ARRAY_POSITION - 3-float positions, expanded to (x, y, z, 1)
//     DPSOFTRAST_ARRAY_COLOR    - float colors if a float pointer is set, else
//       byte colors scaled to floats if a byte pointer is set, else every
//       vertex is filled with the constant dpsoftrast.color
//     otherwise - texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0), loaded
//       with the 2-, 3- or 4-component loader according to components_texcoord
// Strides are byte steps between vertices. The switch/case labels for the
// texcoord component counts, the behavior when a texcoord pointer is unset, and
// the return statement are not visible in this view (the callers use the
// returned pointer as the loaded data - presumably outf).
1920 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1923 float *outf = dpsoftrast.post_array4f[outarray];
1924 const unsigned char *inb;
1925 int firstvertex = dpsoftrast.firstvertex;
1926 int numvertices = dpsoftrast.numvertices;
1930 case DPSOFTRAST_ARRAY_POSITION:
1931 stride = dpsoftrast.stride_vertex;
1932 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1933 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1935 case DPSOFTRAST_ARRAY_COLOR:
1936 stride = dpsoftrast.stride_color;
1937 if (dpsoftrast.pointer_color4f)
1939 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1940 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1942 else if (dpsoftrast.pointer_color4ub)
// (stride was already set from stride_color above; this repeats the same assignment)
1944 stride = dpsoftrast.stride_color;
1945 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1946 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1950 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1954 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1955 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1957 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1958 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1961 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1964 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1967 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// If inarray >= 0 the data is first loaded from that client array into
// post_array4f[outarray]; otherwise the existing contents of
// post_array4f[outarray] are transformed. The return statement is not visible
// in this view - presumably the data pointer is returned.
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1981 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it.
// If inarray >= 0 the data is first loaded from that client array; otherwise
// the existing contents of post_array4f[outarray] are used. Screen coordinates
// go to dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// drawendy, and the value returned by DPSOFTRAST_Vertex_Project to
// dpsoftrast.drawclipped. The return statement is not visible in this view.
1987 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1990 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1991 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen space.
// If inarray >= 0 the data is first loaded from that client array; otherwise
// the existing contents of post_array4f[outarray] are used. Screen coordinates
// go to dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// drawendy, and the value returned by DPSOFTRAST_Vertex_TransformProject to
// dpsoftrast.drawclipped. The return statement is not visible in this view.
1999 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2002 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2003 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel perspective values for a span: evaluates the reciprocal
// of the interpolated w for pixels startx..endx-1 of the span.
//   triangle->w[0], w[1], w[2]: x slope, y slope and base of the interpolated w
//   span->x, span->y: span origin; span->startx/endx: pixel range within the span
//   zf: output buffer; the stores into it are not visible in this view -
//       presumably zf[x] receives the value for pixel x (TODO confirm)
// When the x slope is zero the value is constant across the span (fast path).
// Otherwise the span is processed in sub-spans of DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the exact reciprocal is computed at the sub-span end and the values
// in between are linearly interpolated (dz), avoiding a divide per pixel.
2010 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2013 int startx = span->startx;
2014 int endx = span->endx;
2015 float wslope = triangle->w[0];
2016 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017 float endz = 1.0f / (w + wslope * startx);
2018 if (triangle->w[0] == 0)
2020 // LordHavoc: fast flat polygons (HUD/menu)
2021 for (x = startx;x < endx;x++)
2025 for (x = startx;x < endx;)
2027 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2029 if (nextsub >= endx) nextsub = endsub = endx-1;
2030 endz = 1.0f / (w + wslope * nextsub);
// guard against a zero-length step when the last sub-span is a single pixel
2031 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032 for (; x <= endsub; x++, z += dz)
// Writes one span of shaded 8-bit-per-channel pixels (in4ub, 4 bytes per
// pixel, alpha in byte 3) into dpsoftrast.fb_colorpixels[0] using the blend
// mode in thread->fb_blendmode. Only pixels whose span->pixelmask byte is
// nonzero are written; the span is split into runs of consecutive unmasked
// pixels and each run is copied or blended with SSE2.
// Side effects visible here: pixelmask is modified (alpha kill, zero-alpha
// culling and two sentinel bytes at endx and endx+1).
2037 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2041 int startx = span->startx;
2042 int endx = span->endx;
// 32-bit view of the source pixels, used for whole-pixel loads
2045 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte and 32-bit views of the first color buffer; both are rebased below so
// that index x addresses framebuffer pixel (span->x + x, span->y)
2047 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2048 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2051 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2052 pixeli += span->y * dpsoftrast.fb_width + span->x;
2053 // handle alphatest now (this affects depth writes too)
// a source alpha below 128 fails the test and masks the pixel out
2054 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2055 for (x = startx;x < endx;x++)
2056 if (in4ub[x*4+3] < 128)
2057 pixelmask[x] = false;
2058 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2059 // helps sprites, text and hud artwork
2060 switch(thread->fb_blendmode)
// in these three modes a source alpha of 0 leaves the destination unchanged
// (see the blend formulas further down), so such pixels are masked out
2062 case DPSOFTRAST_BLENDMODE_ALPHA:
2063 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2066 for (x = startx;x < endx;x++)
2068 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with nonzero alpha
2073 while (++x < endx && in4ub[x*4+3] >= 1) ;
2075 if (x >= endx) break;
// clear the mask over a run of pixels with zero alpha
2077 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2078 if (x >= endx) break;
// no mask culling is visible in this listing for the remaining modes
2085 case DPSOFTRAST_BLENDMODE_OPAQUE:
2086 case DPSOFTRAST_BLENDMODE_ADD:
2087 case DPSOFTRAST_BLENDMODE_INVMOD:
2088 case DPSOFTRAST_BLENDMODE_MUL:
2089 case DPSOFTRAST_BLENDMODE_MUL2:
2090 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2091 case DPSOFTRAST_BLENDMODE_INVADD:
2094 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes pixelmask[endx] and pixelmask[endx+1], so the mask
// buffer needs at least endx+2 bytes - confirm against its allocation
2095 pixelmask[endx] = 1;
2096 pixelmask[endx+1] = 0;
2097 // LordHavoc: use a double loop to identify subspans, this helps the
2098 // optimized copy/blend loops to perform at their best, most triangles
2099 // have only one run of pixels, and do the search using wide reads...
2103 // if this pixel is masked off, it's probably not alone...
2110 // the 4-item search must be aligned or else it stalls badly
2111 if ((x & 3) && !pixelmask[x])
2113 if(pixelmask[x]) goto endmasked;
2117 if(pixelmask[x]) goto endmasked;
2121 if(pixelmask[x]) goto endmasked;
// test four mask bytes at once through a 32-bit read; zero means all four are masked off
2126 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish the search one byte at a time; the sentinel at endx stops it
2130 for (;!pixelmask[x];x++)
2132 // rather than continue the loop, just check the end variable
2137 // find length of subspan
2140 if (subx + 8 < endx)
2144 if(!pixelmask[subx]) goto endunmasked;
2148 if(!pixelmask[subx]) goto endunmasked;
2152 if(!pixelmask[subx]) goto endunmasked;
// 0x01010101 is four consecutive mask bytes equal to 1, i.e. four writable pixels
// (assumes set mask bytes are exactly 1 - TODO confirm)
2157 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
// finish the search one byte at a time; the sentinel at endx+1 stops it
2161 for (;pixelmask[subx];subx++)
2163 // the checks can overshoot, so make sure to clip it...
2167 // now that we know the subspan length... process!
2168 switch(thread->fb_blendmode)
// opaque: plain copy of the run [x, subx)
// NOTE(review): a memcpy and hand-written SSE copy loops are both visible; the
// directives or conditions selecting between them are not in this listing
2170 case DPSOFTRAST_BLENDMODE_OPAQUE:
2174 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels per iteration using four unaligned 128-bit copies
2179 while (x + 16 <= subx)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2183 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2184 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2189 while (x + 4 <= subx)
2191 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2197 pixeli[x+1] = ini[x+1];
// alpha blend: dst += (src - dst) * alpha / 256, with alpha taken from word 3
// of each pixel and broadcast to all four channels
2207 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): shared load/blend/store skeleton for the
// non-opaque modes. blend2 runs on two pixels at a time (64-bit loads widened
// to 16 bits per channel), blend1 on a single leftover pixel. Both see src and
// dst as 16-bit lanes and must leave the result in dst, which is packed back
// to bytes with unsigned saturation.
2208 #define FINISHBLEND(blend2, blend1) \
2209 for (;x + 1 < subx;x += 2) \
2212 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2215 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * alpha) >> 8
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst += src (clamped to 255 by the saturating pack)
2243 case DPSOFTRAST_BLENDMODE_ADD:
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2253 case DPSOFTRAST_BLENDMODE_MUL:
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2256 case DPSOFTRAST_BLENDMODE_MUL2:
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * alpha) >> 8
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// source added unscaled over an attenuated destination: dst = src + dst - ((dst * alpha) >> 8)
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += (255 - dst) * src / 256
2277 case DPSOFTRAST_BLENDMODE_INVADD:
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Fetches a single sample from mip level `mip` of `texture` at coordinates
// (x, y), where 1.0 spans the full width/height of that mip, and returns it
// in c[4] as bytes in the texture's own byte order.
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR set the four surrounding texels are
// blended with 12-bit fractional weights; otherwise one texel is selected.
// Coordinates are clamped when DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set and
// wrapped with a size-1 bit mask otherwise.
// NOTE(review): mask wrapping assumes power-of-two mip sizes - confirm.
2289 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2290 // warning: this is SLOW, only use if the optimized per-span functions won't do
2292 const unsigned char * RESTRICT pixelbase;
2293 const unsigned char * RESTRICT pixel[4];
// mipmap[mip][2] and [3] are the mip's width and height in texels
2294 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2295 int wrapmask[2] = { width-1, height-1 };
// mipmap[mip][0] is the byte offset of this mip inside texture->bytes
2296 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2297 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// texel coordinates in 20.12 fixed point, shifted back by half a texel (2048)
// NOTE(review): the float expression is converted straight to unsigned int, so
// a negative coordinate is not representable, and tci below (derived from an
// unsigned shift) can never be negative - the ">= 0" clamps look ineffective
// on the low side; confirm intended behavior for x,y < 0
2299 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
// 12-bit fraction, its complement, and the four bilinear weights (they sum to 1<<24)
2300 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2301 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2302 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
// integer texel and its +1 neighbour on each axis
2303 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2304 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2305 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2307 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2308 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2309 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2310 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2314 tci[0] &= wrapmask[0];
2315 tci[1] &= wrapmask[1];
2316 tci1[0] &= wrapmask[0];
2317 tci1[1] &= wrapmask[1];
// the four corner texels, in the same order as the lerp[] weights
2319 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2320 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2321 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2322 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum per channel; >>24 removes the 1<<24 weight scale
2323 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2324 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2325 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2326 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// nearest filtering: truncate to the containing texel
// NOTE(review): the statements copying pixel[0] into c are not visible in this listing
2330 int tci[2] = { x * width, y * height };
2331 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2333 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2334 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2338 tci[0] &= wrapmask[0];
2339 tci[1] &= wrapmask[1];
2341 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Samples the 2D texture bound to texunitindex for every pixel of the span
// and writes the result to out4f as four floats per pixel. Texel bytes
// 2, 1, 0, 3 go to outputs 0, 1, 2, 3, scaled to the 0..1 range.
// Texture coordinates come from vertex attribute arrayindex: they are
// evaluated exactly ((data + slope*x) * zf[x] * mip size) only at subspan
// boundaries and stepped in 20.12 fixed point in between.
// Four inner loops cover bilinear/single-texel sampling combined with
// clamp-to-edge/wrap addressing.
2350 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2353 int startx = span->startx;
2354 int endx = span->endx;
2359 float tc[2], endtc[2];
2361 unsigned int tci[2];
2362 unsigned int tci1[2];
2363 unsigned int tcimin[2];
2364 unsigned int tcimax[2];
2369 const unsigned char * RESTRICT pixelbase;
2370 const unsigned char * RESTRICT pixel[4];
2371 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2372 // if no texture is bound, just fill it with white
2375 for (x = startx;x < endx;x++)
2377 out4f[x*4+0] = 1.0f;
2378 out4f[x*4+1] = 1.0f;
2379 out4f[x*4+2] = 1.0f;
2380 out4f[x*4+3] = 1.0f;
2384 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is the byte offset of the selected mip inside texture->bytes
2385 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2386 // if this mipmap of the texture is 1 pixel, just fill it with that color
// mipmap[mip][1] == 4 is taken here to mean a single 4-byte texel
// NOTE(review): the color is read from the start of texture->bytes rather than
// from pixelbase (the BGRA8 variant of this function reads pixelbase) - confirm
// this is intended when the 1-texel mip is not stored first
2387 if (texture->mipmap[mip][1] == 4)
2389 c[0] = texture->bytes[2] * (1.0f/255.0f);
2390 c[1] = texture->bytes[1] * (1.0f/255.0f);
2391 c[2] = texture->bytes[0] * (1.0f/255.0f);
2392 c[3] = texture->bytes[3] * (1.0f/255.0f);
2393 for (x = startx;x < endx;x++)
2395 out4f[x*4+0] = c[0];
2396 out4f[x*4+1] = c[1];
2397 out4f[x*4+2] = c[2];
2398 out4f[x*4+3] = c[3];
// nonzero when bilinear filtering is requested
2402 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2403 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2404 flags = texture->flags;
// mip width/height: as a scale from normalized to texel coordinates, and as the row width
2405 tcscale[0] = texture->mipmap[mip][2];
2406 tcscale[1] = texture->mipmap[mip][3];
2407 tciwidth = texture->mipmap[mip][2];
// largest valid texel index per axis, used both as clamp limit and as wrap mask
// NOTE(review): tcimin is assigned on lines not visible in this listing; mask
// wrapping assumes power-of-two mip sizes - confirm
2410 tcimax[0] = texture->mipmap[mip][2]-1;
2411 tcimax[1] = texture->mipmap[mip][3]-1;
2412 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2413 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texture coordinate (in texels) at the first pixel of the span
2414 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2415 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in chunks of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2421 for (x = startx;x < endx;)
2423 unsigned int subtc[2];
2424 unsigned int substep[2];
// 4096 is one texel in 20.12 fixed point; subscale turns a per-chunk delta into a per-pixel step
2425 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2426 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2427 if (nextsub >= endx)
// final chunk: end exactly on the last pixel and rescale the step to its length
2429 nextsub = endsub = endx-1;
2430 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
// exact texture coordinate at the end of this chunk
2434 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2435 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// fixed-point start coordinate and per-pixel step for this chunk
2441 substep[0] = (endtc[0] - tc[0]) * subscale;
2442 substep[1] = (endtc[1] - tc[1]) * subscale;
2443 subtc[0] = tc[0] * (1<<12);
2444 subtc[1] = tc[1] * (1<<12);
// bilinear sampling, clamp to edge
2447 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2449 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fraction, its complement, and the four bilinear weights (they sum to 1<<24)
2451 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2452 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2453 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2454 tci[0] = subtc[0]>>12;
2455 tci[1] = subtc[1]>>12;
2456 tci1[0] = tci[0] + 1;
2457 tci1[1] = tci[1] + 1;
2458 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2459 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2460 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2461 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2462 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2464 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2465 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weights sum to 1<<24 and texel bytes are at most 255, so dividing by
// 0xFF000000 normalises the weighted sum to 0..1
2466 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2467 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2468 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2469 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2470 out4f[x*4+0] = c[0];
2471 out4f[x*4+1] = c[1];
2472 out4f[x*4+2] = c[2];
2473 out4f[x*4+3] = c[3];
// bilinear sampling, wrapping
2478 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2480 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2481 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2482 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2483 tci[0] = subtc[0]>>12;
2484 tci[1] = subtc[1]>>12;
2485 tci1[0] = tci[0] + 1;
2486 tci1[1] = tci[1] + 1;
2487 tci[0] &= tciwrapmask[0];
2488 tci[1] &= tciwrapmask[1];
2489 tci1[0] &= tciwrapmask[0];
2490 tci1[1] &= tciwrapmask[1];
2491 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2492 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2493 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2494 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2495 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2496 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2497 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2498 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2499 out4f[x*4+0] = c[0];
2500 out4f[x*4+1] = c[1];
2501 out4f[x*4+2] = c[2];
2502 out4f[x*4+3] = c[3];
// single-texel sampling, clamp to edge
2506 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2508 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2510 tci[0] = subtc[0]>>12;
2511 tci[1] = subtc[1]>>12;
2512 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2513 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2514 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2515 c[0] = pixel[0][2] * (1.0f / 255.0f);
2516 c[1] = pixel[0][1] * (1.0f / 255.0f);
2517 c[2] = pixel[0][0] * (1.0f / 255.0f);
2518 c[3] = pixel[0][3] * (1.0f / 255.0f);
2519 out4f[x*4+0] = c[0];
2520 out4f[x*4+1] = c[1];
2521 out4f[x*4+2] = c[2];
2522 out4f[x*4+3] = c[3];
// single-texel sampling, wrapping
2527 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2529 tci[0] = subtc[0]>>12;
2530 tci[1] = subtc[1]>>12;
2531 tci[0] &= tciwrapmask[0];
2532 tci[1] &= tciwrapmask[1];
2533 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2534 c[0] = pixel[0][2] * (1.0f / 255.0f);
2535 c[1] = pixel[0][1] * (1.0f / 255.0f);
2536 c[2] = pixel[0][0] * (1.0f / 255.0f);
2537 c[3] = pixel[0][3] * (1.0f / 255.0f);
2538 out4f[x*4+0] = c[0];
2539 out4f[x*4+1] = c[1];
2540 out4f[x*4+2] = c[2];
2541 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture span sampler: fetches the texture bound to
// texunitindex for every pixel of the span and stores packed 4-byte texels in
// out4ub (texel byte order is kept as stored in the texture).
// Texture coordinates are interpolated from vertex attribute arrayindex in
// 16.16 fixed point, two pixels per iteration where possible, with exact
// evaluation (using zf) only at subspan boundaries.
// Visible paths: an in-range bilinear fast path, bilinear with clamp or wrap
// addressing, and direct texel copies with clamp or wrap addressing.
2548 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2552 int startx = span->startx;
2553 int endx = span->endx;
2555 __m128 data, slope, tcscale;
2556 __m128i tcsize, tcmask, tcoffset, tcmax;
2558 __m128i subtc, substep, endsubtc;
2561 int affine; // LordHavoc: optimized affine texturing case
// 32-bit view of the output so a whole pixel is stored at once
2562 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2563 const unsigned char * RESTRICT pixelbase;
2564 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2565 // if no texture is bound, just fill it with white
2568 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2571 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is the byte offset of the selected mip inside texture->bytes
2572 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2573 // if this mipmap of the texture is 1 pixel, just fill it with that color
// k is that single texel read as one 32-bit value
2574 if (texture->mipmap[mip][1] == 4)
2576 unsigned int k = *((const unsigned int *)pixelbase);
2577 for (x = startx;x < endx;x++)
// affine: 1/w is identical at both ends of the span, so the whole span is
// stepped linearly as a single subspan
2581 affine = zf[startx] == zf[endx-1];
2582 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2583 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2584 flags = texture->flags;
// tcsize = (width, height, width, height) taken from mipmap[mip][2] and [3];
// tcmask = size - 1 per lane
2585 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2586 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2587 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and scale them to texel units
2588 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2589 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel of the span
2590 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// NOTE(review): the half-texel offset is presumably applied only for linear
// filtering; the guarding condition is not visible in this listing
2592 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// convert to 16.16 fixed point
2593 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset packs two 16-bit multipliers into every 32-bit lane: low word 4
// (bytes per texel) and high word width*4 (bytes per row), so that
// _mm_madd_epi16 of an (x, y) word pair with it yields a texel byte offset
// NOTE(review): signed 16-bit lanes limit coordinates and the row stride to
// values below 32768 - confirm against the supported texture sizes
2594 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes, used as clamp limit and as wrap mask
2595 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in chunks of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels (one chunk when affine)
2596 for (x = startx;x < endx;)
2598 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2599 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2600 if (nextsub >= endx || affine)
// final (or only) chunk: end exactly on the last pixel and rescale the step to its length
2602 nextsub = endsub = endx-1;
2603 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact texel-space coordinate at the end of this chunk
2607 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2609 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2610 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2611 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc now holds the coordinates of two consecutive pixels (pixel x in the
// low half, pixel x+1 in the high half), so the step is doubled
2612 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2613 substep = _mm_slli_epi32(substep, 1);
// fast path test: when the integer coordinates at both ends of the chunk are
// >= 0 and < size-1 on both axes, no clamping or wrapping is needed and the
// x+1 neighbour can be fetched together with x by one 64-bit load
2616 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2617 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row stride in bytes (the high word of tcoffset)
2619 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2620 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2622 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep only the integer (high) words of x and y for both pixels, then turn them into byte offsets
2623 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2624 tci = _mm_madd_epi16(tci, tcoffset);
2625 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2626 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// pix1/pix2: upper and lower texel pairs for the first pixel; pix3/pix4: same for the second
2627 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2628 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2629 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2630 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved to 15 bits so they stay non-negative in the signed 16-bit
// multiplies; mulhi of (delta << 1) by fracm gives delta * fraction.
// The two rows are blended with the y fraction first, then the two columns
// with the x fraction.
2631 fracm = _mm_srli_epi16(subtc, 1);
2632 pix1 = _mm_add_epi16(pix1,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635 pix3 = _mm_add_epi16(pix3,
2636 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640 pix2 = _mm_add_epi16(pix2,
2641 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single leftover pixel of the fast path
2647 const unsigned char * RESTRICT ptr1;
2648 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2649 tci = _mm_madd_epi16(tci, tcoffset);
2650 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2651 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2652 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2653 fracm = _mm_srli_epi16(subtc, 1);
2654 pix1 = _mm_add_epi16(pix1,
2655 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2657 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2658 pix1 = _mm_add_epi16(pix1,
2659 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2661 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with clamp-to-edge addressing: every corner texel is fetched individually
2665 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2667 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// replicate the integer (x, y) of the first pixel into all four lanes; adding
// (0,0), (1,0), (0,1), (1,1) turns the lanes into the four bilinear corners
2669 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2670 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2671 tci = _mm_madd_epi16(tci, tcoffset);
2672 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2673 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2674 _mm_setzero_si128());
2675 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2676 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2677 _mm_setzero_si128());
// same for the second pixel of the pair
// NOTE(review): the corners of the second pixel are wrapped with AND here,
// while the first pixel above (and the single-pixel tail below) clamp with
// min/max - this looks like a copy/paste inconsistency inside the
// clamp-to-edge branch; confirm and align if unintended
2678 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2679 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2680 tci = _mm_madd_epi16(tci, tcoffset);
2681 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683 _mm_setzero_si128());
2684 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686 _mm_setzero_si128());
// same two-stage blend as in the fast path
2687 fracm = _mm_srli_epi16(subtc, 1);
2688 pix1 = _mm_add_epi16(pix1,
2689 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2690 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2691 pix3 = _mm_add_epi16(pix3,
2692 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2693 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2694 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2695 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2696 pix2 = _mm_add_epi16(pix2,
2697 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2698 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2699 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single leftover pixel, clamped
2703 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2704 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2705 tci = _mm_madd_epi16(tci, tcoffset);
2706 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2707 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2708 _mm_setzero_si128());
2709 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2710 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2711 _mm_setzero_si128());
2712 fracm = _mm_srli_epi16(subtc, 1);
2713 pix1 = _mm_add_epi16(pix1,
2714 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2715 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2716 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2717 pix1 = _mm_add_epi16(pix1,
2718 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2719 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2720 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with wrap addressing: corner coordinates are masked with tcmax
2726 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2729 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2730 tci = _mm_madd_epi16(tci, tcoffset);
2731 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2732 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2733 _mm_setzero_si128());
2734 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2735 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2736 _mm_setzero_si128());
2737 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2738 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742 _mm_setzero_si128());
2743 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745 _mm_setzero_si128());
2746 fracm = _mm_srli_epi16(subtc, 1);
2747 pix1 = _mm_add_epi16(pix1,
2748 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2749 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2750 pix3 = _mm_add_epi16(pix3,
2751 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2752 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2753 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2754 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2755 pix2 = _mm_add_epi16(pix2,
2756 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2757 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2758 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single leftover pixel, wrapped
2762 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2763 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2764 tci = _mm_madd_epi16(tci, tcoffset);
2765 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2766 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2767 _mm_setzero_si128());
2768 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2769 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2770 _mm_setzero_si128());
2771 fracm = _mm_srli_epi16(subtc, 1);
2772 pix1 = _mm_add_epi16(pix1,
2773 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2774 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2775 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2776 pix1 = _mm_add_epi16(pix1,
2777 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2778 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2779 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// unfiltered sampling: the selected texel is copied as one 32-bit value;
// clamp-to-edge addressing first, wrap addressing below
2786 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2788 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2790 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2791 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2792 tci = _mm_madd_epi16(tci, tcoffset);
2793 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2794 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single leftover pixel, clamped
2798 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2799 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2800 tci = _mm_madd_epi16(tci, tcoffset);
2801 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// unfiltered sampling with wrap addressing
2807 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2809 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2810 tci = _mm_and_si128(tci, tcmax);
2811 tci = _mm_madd_epi16(tci, tcoffset);
2812 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2813 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single leftover pixel, wrapped
2817 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2818 tci = _mm_and_si128(tci, tcmax);
2819 tci = _mm_madd_epi16(tci, tcoffset);
2820 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2829 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2832 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow map sampling hook: takes a pointer to float components and returns a float.
// NOTE(review): the body is not visible in this listing - confirm what it
// returns before relying on it.
2835 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies every pixel of in4f by an interpolated vertex attribute and
// stores the product in out4f (four floats per pixel, indexed by x*4).
// The attribute is evaluated per channel as (data + slope*x) * z, where data
// and slope are produced by DPSOFTRAST_CALCATTRIB4F for arrayindex.
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing - presumably z = zf[x]; confirm against the full file.
2842 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2845 int startx = span->startx;
2846 int endx = span->endx;
2851 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2852 for (x = startx;x < endx;x++)
// attribute value at pixel x, one channel at a time
2855 c[0] = (data[0] + slope[0]*x) * z;
2856 c[1] = (data[1] + slope[1]*x) * z;
2857 c[2] = (data[2] + slope[2]*x) * z;
2858 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input buffer by it
2859 out4f[x*4+0] = in4f[x*4+0] * c[0];
2860 out4f[x*4+1] = in4f[x*4+1] * c[1];
2861 out4f[x*4+2] = in4f[x*4+2] * c[2];
2862 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into out4f for every pixel of the
// span (four floats per pixel, indexed by x*4).
// The attribute is evaluated per channel as (data + slope*x) * z, where data
// and slope are produced by DPSOFTRAST_CALCATTRIB4F for arrayindex.
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing - presumably z = zf[x]; confirm against the full file.
2868 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2871 int startx = span->startx;
2872 int endx = span->endx;
2877 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2878 for (x = startx;x < endx;x++)
// attribute value at pixel x, one channel at a time
2881 c[0] = (data[0] + slope[0]*x) * z;
2882 c[1] = (data[1] + slope[1]*x) * z;
2883 c[2] = (data[2] + slope[2]*x) * z;
2884 c[3] = (data[3] + slope[3]*x) * z;
2885 out4f[x*4+0] = c[0];
2886 out4f[x*4+1] = c[1];
2887 out4f[x*4+2] = c[2];
2888 out4f[x*4+3] = c[3];
// Float bloom combine for a span:
//   out4f = ina4f + max(inb4f - subcolor, 0), per component,
// for every x in [span->startx, span->endx).
//   ina4f    - base colour
//   inb4f    - colour from which subcolor is subtracted before adding
//   subcolor - four floats subtracted from every inb4f pixel
// triangle is not used by the visible code.
2894 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2896 int x, startx = span->startx, endx = span->endx;
2897 float c[4], localcolor[4];
// copy the subtract colour into locals before the loop
2898 localcolor[0] = subcolor[0];
2899 localcolor[1] = subcolor[1];
2900 localcolor[2] = subcolor[2];
2901 localcolor[3] = subcolor[3];
2902 for (x = startx;x < endx;x++)
// subtract and clamp negative results to zero
2904 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2905 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2906 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2907 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add the clamped remainder onto the base colour
2908 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2909 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2910 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2911 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float RGBA spans:
//   out4f[x*4+i] = ina4f[x*4+i] * inb4f[x*4+i]
// for every x in [span->startx, span->endx).
// triangle is not used by the visible code.
2917 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2919 int x, startx = span->startx, endx = span->endx;
2920 for (x = startx;x < endx;x++)
2922 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2923 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2924 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2925 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float RGBA spans:
//   out4f[x*4+i] = ina4f[x*4+i] + inb4f[x*4+i]
// for every x in [span->startx, span->endx). No clamping is applied.
// triangle is not used by the visible code.
2931 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2933 int x, startx = span->startx, endx = span->endx;
2934 for (x = startx;x < endx;x++)
2936 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2937 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2938 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2939 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans using weights a and b:
//   out4f = ina4f * a + inb4f * b, for all four components,
// for every x in [span->startx, span->endx).
// a is one minus the fourth component (index 3) of the inb4f pixel.
// NOTE(review): the declaration of a/b and the assignment of b are not
// among the lines shown; presumably b = inb4f[x*4+3] - confirm.
// triangle is not used by the visible code.
2945 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2947 int x, startx = span->startx, endx = span->endx;
2949 for (x = startx;x < endx;x++)
// weight applied to ina4f
2951 a = 1.0f - inb4f[x*4+3];
2953 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2954 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2955 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2956 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float RGBA span toward one constant colour:
//   out4f = in4f * (1 - color[3]) + color * color[3], for all four
// components (the fourth component is blended the same way),
// for every x in [span->startx, span->endx).
//   color - four floats; color[3] is the blend factor
// triangle is not used by the visible code.
2962 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2964 int x, startx = span->startx, endx = span->endx;
2965 float localcolor[4], ilerp, lerp;
2966 localcolor[0] = color[0];
2967 localcolor[1] = color[1];
2968 localcolor[2] = color[2];
2969 localcolor[3] = color[3];
// the blend weights are constant across the span
2970 ilerp = 1.0f - localcolor[3];
2971 lerp = localcolor[3];
2972 for (x = startx;x < endx;x++)
2974 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2975 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2976 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2977 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2 version of the "multiply by varying attribute" step for 8-bit pixels.
// The attribute (selected by arrayindex) is evaluated exactly only at the
// ends of sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels and stepped
// linearly in 16-bit fixed point (scale 256) in between. Each input pixel
// from in4ub is multiplied by that modulator and stored to out4ub.
//   zf - per-pixel factor applied to the attribute at sub-span ends
// NOTE(review): several lines of this function are not shown (declarations,
// the copy of endmod/endsubmod into mod/submod, and the guard around the
// trailing single-pixel store); the comments cover the visible lines only.
2984 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2988 int startx = span->startx;
2989 int endx = span->endx;
2992 __m128i submod, substep, endsubmod;
2993 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA -> BGRA to match the pixel layout)
2994 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2995 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, then converted to integers scaled by 256
2996 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2997 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the body, one sub-span per outer iteration
2998 for (x = startx; x < endx;)
3000 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3001 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// the last sub-span is clipped to the final pixel and the step rescaled to
// its shorter length
3002 if (nextsub >= endx)
3004 nextsub = endsub = endx-1;
3005 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute at the sub-span end and the per-pixel step toward it
3009 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3010 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3011 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack two consecutive pixels' modulators (current and current+step) into
// one register of 16-bit lanes; the step is duplicated in both halves
3012 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3013 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3014 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high byte of each 16-bit lane (value*256), so the
// high half of the 16x16 product equals byte * modulator / 256
3016 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3017 pix = _mm_mulhi_epu16(pix, submod);
3018 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3022 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3023 pix = _mm_mulhi_epu16(pix, submod);
3024 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 routine that writes an interpolated vertex attribute directly as
// 8-bit pixels. The attribute (selected by arrayindex) is evaluated exactly
// at the ends of sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels and
// stepped linearly in 16-bit fixed point (scale 4095) in between; shifting
// right by 4 and packing with unsigned saturation yields the output bytes.
//   zf - per-pixel factor applied to the attribute at sub-span ends
// NOTE(review): several lines of this function are not shown (declarations,
// the copy of endmod/endsubmod into mod/submod, and the guard around the
// trailing single-pixel store); the comments cover the visible lines only.
3031 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3035 int startx = span->startx;
3036 int endx = span->endx;
3039 __m128i submod, substep, endsubmod;
3040 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA -> BGRA to match the pixel layout)
3041 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3042 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, then converted to integers scaled by 4095
3043 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3044 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the body, one sub-span per outer iteration
3045 for (x = startx; x < endx;)
3047 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3048 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// the last sub-span is clipped to the final pixel and the step rescaled to
// its shorter length
3049 if (nextsub >= endx)
3051 nextsub = endsub = endx-1;
3052 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute at the sub-span end and the per-pixel step toward it
3056 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3057 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3058 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack two consecutive pixels' values (current and current+step) into one
// register of 16-bit lanes; the step is duplicated in both halves
3059 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3060 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3061 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// 4095 >> 4 == 255, so the shift maps the fixed-point range onto a byte
3063 __m128i pix = _mm_srai_epi16(submod, 4);
3064 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3068 __m128i pix = _mm_srai_epi16(submod, 4);
3069 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 8-bit bloom combine for a span:
//   out = saturate(ina + max(inb - subcolor*255, 0)) per byte channel.
//   subcolor - four floats; scaled by 255, converted to integers, lanes 0
//              and 2 swapped (presumably RGBA -> BGRA), packed to 16 bits
// Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3076 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3079 int x, startx = span->startx, endx = span->endx;
3080 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// duplicate the four 16-bit channels into both halves (one per pixel)
3081 localcolor = _mm_packs_epi32(localcolor, localcolor);
3082 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of two pixels to 16-bit lanes
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps at zero; packus clamps the sum at 255
3086 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3087 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3091 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3093 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3094 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit product of two pixel spans: out = ina * inb / 256 per byte
// channel. Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3099 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3102 int x, startx = span->startx, endx = span->endx;
3103 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended; inb bytes go in the high byte (value*256), so
// the high half of the 16x16 product is ina*inb/256
3105 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3106 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3107 pix1 = _mm_mulhi_epu16(pix1, pix2);
3108 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3112 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3113 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3114 pix1 = _mm_mulhi_epu16(pix1, pix2);
3115 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit saturating sum of two pixel spans: out = min(ina + inb, 255)
// per byte channel. Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3120 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3123 int x, startx = span->startx, endx = span->endx;
3124 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16 bits so the add cannot wrap; packus then
// saturates the result back to bytes
3126 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3127 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3128 pix1 = _mm_add_epi16(pix1, pix2);
3129 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3133 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3134 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3135 pix1 = _mm_add_epi16(pix1, pix2);
3136 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit tinted add: out = saturate(ina + inb * tint / 256) per byte
// channel, where tint is inbtintbgra scaled by 256.
//   inbtintbgra - four floats; no lane swap is applied here, so they are
//                 used in the order given
// Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3142 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3145 int x, startx = span->startx, endx = span->endx;
3146 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// duplicate the four 16-bit channels into both halves (one per pixel)
3147 tint = _mm_packs_epi32(tint, tint);
3148 for (x = startx;x+2 <= endx;x+=2)
// ina zero-extended; inb in the high byte so mulhi gives inb*tint/256
3150 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3151 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3152 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3153 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3157 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3158 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3159 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3160 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit blend of two pixel spans, weighted by the fourth byte of each
// inb pixel: out = ina + (inb - ina) * inb_alpha / 256 per byte channel.
// Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3166 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3169 int x, startx = span->startx, endx = span->endx;
3170 for (x = startx;x+2 <= endx;x+=2)
3172 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3173 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth channel across that pixel's four lanes
3174 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half product equals
// (inb - ina) * alpha / 256
3175 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3176 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3180 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3181 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3182 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3183 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3184 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 8-bit blend of a pixel span toward one constant colour:
//   out = in + (color255 - in) * alpha255 / 256 per byte channel,
// where color255 is color scaled by 255 and alpha255 its fourth channel.
//   color - four floats; lanes 0 and 2 are swapped after conversion
//           (presumably RGBA -> BGRA)
// Pixels are processed two at a time while x+2 <= endx.
// NOTE(review): the guard around the trailing single-pixel code is not
// among the lines shown. triangle is not used by the visible code.
3189 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3192 int x, startx = span->startx, endx = span->endx;
3193 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// duplicate the four 16-bit channels into both halves (one per pixel)
3194 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the fourth channel to every lane and pre-shift it by 4; the
// weight is constant across the span
3195 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3196 for (x = startx;x+2 <= endx;x+=2)
3198 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted by 4 as well, so the signed high-half product
// equals (color - in) * alpha / 256
3199 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3200 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3204 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3205 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3206 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader. Runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, then passes the colour and texcoord0 arrays to
// DPSOFTRAST_Array_Load. Texcoord1 is loaded only when the global
// dpsoftrast.shader_permutation has SHADERPERMUTATION_SPECULAR set.
// NOTE(review): the helpers' argument order (presumably output array, input
// array) is not visible here - confirm.
3213 static void DPSOFTRAST_VertexShader_Generic(void)
3215 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3216 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3217 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texcoord set is only needed by the SPECULAR permutation
3218 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3219 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader, working on 8-bit span buffers.
// With SHADERPERMUTATION_DIFFUSE the first texture is sampled and multiplied
// by the interpolated attribute at array index 1. With
// SHADERPERMUTATION_SPECULAR a second texture is sampled and combined into
// the result by exactly one of: multiply (COLORMAPPING), add (GLOW) or
// alpha mix (VERTEXTEXTUREBLEND). The DPSOFTRAST_Draw_Span_VaryingBGRA8 call
// writes the interpolated attribute alone. The span is then handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): brace and else lines are not part of the lines shown, so
// the nesting of the branches is inferred from statement order.
3222 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3224 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3225 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3229 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture modulated by the attribute at array index 1
3231 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3232 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3233 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3235 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3236 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3239 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// This test used to repeat SHADERPERMUTATION_COLORMAPPING, which made the
// additive branch unreachable. SHADERPERMUTATION_GLOW is the permutation
// that gates the additive term in DPSOFTRAST_PixelShader_Lightmap.
// NOTE(review): confirm GLOW against the reference shader for this mode.
3241 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3244 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3246 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3249 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: interpolated attribute only
3254 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3255 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader. Runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform and loads texcoord0. The last call pairs TEXCOORD1 with TEXCOORD4.
// NOTE(review): presumably this copies input texcoord4 into output slot
// texcoord1 - confirm the helper's argument order.
3260 static void DPSOFTRAST_VertexShader_PostProcess(void)
3262 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3263 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3264 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader. Samples the first texture straight
// into the fragment colour buffer. With SHADERPERMUTATION_BLOOM it samples
// the second texture (array index 3) and adds it after subtracting the
// BloomColorSubtract uniform. The result is blended toward the ViewTintColor
// uniform and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are tested but have no
// implementation in the lines shown (TODO markers only).
3267 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3269 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3270 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3271 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3272 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3274 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3275 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture added in place on top of the frame colour
3277 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3278 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// view tint blend, also performed in place
3280 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3281 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3283 // TODO: implement saturation
3285 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3287 // TODO: implement gammaramps
3289 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is
// processed, through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform. No other arrays are touched here.
3294 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3296 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the fragment colour
// bytes for pixels [span->startx, span->endx) and hands the span to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3299 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3301 // this is never called (because colormask is off when this shader is used)
3302 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3303 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering only the span's own range
3305 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3306 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-colour shader. Runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, and the texcoord0 array through DPSOFTRAST_Array_Transform with
// the TexMatrixM1 uniform.
3311 static void DPSOFTRAST_VertexShader_FlatColor(void)
3313 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3314 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader: output = colour texture multiplied
// by the Color_Ambient uniform, with the alpha multiplier taken from the
// Alpha uniform. Results are written straight into the framebuffer row
// unless SHADERPERMUTATION_ALPHAKILL is set or the blend mode is not
// opaque; then they go to a local buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): declarations of color/pix/pix2 and the statements that
// follow the 4-pixel store (presumably advancing x past the extra pixels)
// are not among the lines shown.
3317 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3320 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in colour buffer 0
3321 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3322 int x, startx = span->startx, endx = span->endx;
3323 __m128i Color_Ambientm;
3324 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3325 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3326 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3327 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3328 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or blending needs the finish pass, so render to the local buffer
3329 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3330 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; the fourth lane
// is cleared and replaced with Alpha*255, then all lanes are packed to
// 16 bits (same four channels in both halves)
3331 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3332 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3333 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3334 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3335 for (x = startx;x < endx;x++)
// fast path: four pixels at once when all four mask bytes equal 1
3338 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go in the high byte of each lane, so mulhi yields
// multiplier * byte / 256
3341 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3342 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3343 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3344 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant of the same computation
3350 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3351 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3352 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3354 if (pixel == buffer_FragColorbgra8)
3355 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-colour shader. Runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, loads the colour array, and runs the texcoord0 array through
// DPSOFTRAST_Array_Transform with the TexMatrixM1 uniform.
3361 static void DPSOFTRAST_VertexShader_VertexColor(void)
3363 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3365 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader:
//   output = (Color_Diffuse * vertexcolor + Color_Ambient) * texture,
// with the alpha multiplier taken from the Alpha uniform (the diffuse
// term's fourth lane is cleared). The vertex colour is the interpolated
// DPSOFTRAST_ARRAY_COLOR attribute multiplied by the per-pixel buffer_z
// value. Results go straight to the framebuffer row unless
// SHADERPERMUTATION_ALPHAKILL is set or the blend mode is not opaque; then
// a local buffer is used and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): declarations of data/slope/mod2/pix2 and the statements
// that follow the 4-pixel store (presumably advancing x past the extra
// pixels) are not among the lines shown.
3368 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3371 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in colour buffer 0
3372 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3373 int x, startx = span->startx, endx = span->endx;
3374 __m128i Color_Ambientm, Color_Diffusem;
3376 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3377 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3380 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3381 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or blending needs the finish pass, so render to the local buffer
3382 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3383 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; the fourth lane
// is replaced with Alpha*255, then packed to 16 bits
3384 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3385 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3386 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3387 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 4096 with its fourth lane cleared; multiplied
// (high half) with the 4096-scaled vertex colour this gives a 256-scaled
// term that can be added to Color_Ambientm
3388 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3391 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, advance to the first pixel, and scale by 4096
3392 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3393 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3394 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3395 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3396 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// the loop increment steps data by one pixel
3397 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3399 __m128i color, mod, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3400 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// data is stepped three times here; the loop increment supplies the fourth
3403 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3404 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3405 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3406 data = _mm_add_ps(data, slope);
3407 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3408 data = _mm_add_ps(data, slope);
3409 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3410 data = _mm_add_ps(data, slope);
3411 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3412 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3413 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3414 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3415 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3416 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant of the same computation
3422 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3423 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3424 mod = _mm_packs_epi32(mod, mod);
3425 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3426 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3428 if (pixel == buffer_FragColorbgra8)
3429 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader. Runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1
// uniform, runs the texcoord0 array through DPSOFTRAST_Array_Transform with
// the TexMatrixM1 uniform, and loads texcoord4, which the pixel stage uses
// for the lightmap lookup.
3435 static void DPSOFTRAST_VertexShader_Lightmap(void)
3437 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3438 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3439 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   output = (Color_Diffuse * lightmap + Color_Ambient) * colortexture
//            [+ Color_Glow * glowtexture when SHADERPERMUTATION_GLOW is set]
// with the alpha multiplier taken from the Alpha uniform (the fourth lanes
// of the diffuse and glow colours are cleared). The colour and glow textures
// use texcoord0; the lightmap uses texcoord4. Results go straight to the
// framebuffer row unless SHADERPERMUTATION_ALPHAKILL is set or the blend
// mode is not opaque; then a local buffer is used and passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): brace/else lines, the pix2 declaration and the statements
// following each 4-pixel store (presumably advancing x past the extra
// pixels) are not among the lines shown.
3442 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3445 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's position in colour buffer 0
3446 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3447 int x, startx = span->startx, endx = span->endx;
3448 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3449 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3450 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3451 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3452 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3454 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3455 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3456 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha kill or blending needs the finish pass, so render to the local buffer
3457 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3458 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 with lanes 0 and 2 swapped; the fourth lane
// is replaced with Alpha*255, then packed to 16 bits
3459 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3460 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3461 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3462 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 256 with its fourth lane cleared
3463 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3464 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3465 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3466 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare its colour multiplier
3468 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3469 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3470 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3471 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits hold the ambient colour, high 64 bits the glow colour; used by
// the single-pixel path to do both multiplies in one instruction
3472 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3473 for (x = startx;x < endx;x++)
3475 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3476 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3479 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3480 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3481 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3482 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3483 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3484 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3485 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3486 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3487 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3488 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour term sits in the low half and the glow
// term in the high half; after one multiply the halves are added together
3494 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3495 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3496 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3497 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3498 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3499 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3504 for (x = startx;x < endx;x++)
3506 __m128i color, lightmap, pix;
// fast path: four pixels at once when all four mask bytes equal 1
3507 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3510 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3511 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3512 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3513 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3514 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3515 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3516 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel variant of the same computation
3522 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3523 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3524 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3525 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3528 if (pixel == buffer_FragColorbgra8)
3529 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrappers that
// follow call these two shaders before their definitions appear further down.
3534 void DPSOFTRAST_VertexShader_LightDirection(void);
3535 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader: does no work of its own, it simply forwards to
// DPSOFTRAST_VertexShader_LightDirection().
3537 static void DPSOFTRAST_VertexShader_FakeLight(void)
3539 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader: forwards thread/triangle/span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which has its own
// SHADERMODE_FAKELIGHT branches selected through thread->shader_mode.
3542 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3544 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space light-direction-map vertex shader: runs the common
// LightDirection vertex shader and additionally loads the TEXCOORD4 array,
// which the pixel shader uses as the coordinate set when sampling the
// lightmap and deluxemap textures.
3549 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3551 DPSOFTRAST_VertexShader_LightDirection();
3552 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE based on thread->shader_mode.
3555 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3557 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex shader: identical in structure to
// the model-space variant - the common LightDirection vertex shader followed
// by a load of the TEXCOORD4 array used for lightmap/deluxemap sampling.
3562 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3564 DPSOFTRAST_VertexShader_LightDirection();
3565 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE based on thread->shader_mode.
3568 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3570 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader used directly and by the FakeLight / LightDirectionMap
// wrappers. For every vertex it expresses the light direction and the
// vertex-to-eye vector in the basis formed by TEXCOORD1 (svector),
// TEXCOORD2 (tvector) and TEXCOORD3 (normal), stores them in TEXCOORD5 and
// TEXCOORD6, and finally projects the positions.
3575 void DPSOFTRAST_VertexShader_LightDirection(void)
3578 int numvertices = dpsoftrast.numvertices;
3580 float LightVector[4];
3581 float EyePosition[4];
3582 float EyeVectorModelSpace[4];
// Copy the LightDir and EyePosition uniforms into locals.
3588 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3589 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3590 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3591 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3592 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3593 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3594 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3595 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input arrays into post_array4f; TEXCOORD0 is additionally
// transformed by the TexMatrixM1 uniform matrix.
3596 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3597 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3598 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3599 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3600 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3601 for (i = 0;i < numvertices;i++)
// Per-vertex inputs: position plus the three basis vectors.
3603 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3604 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3605 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3606 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3607 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3608 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3609 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3610 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3611 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3612 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3613 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3614 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (dot(s, LightDir), dot(t, LightDir), dot(n, LightDir)),
// written to TEXCOORD5 with w forced to 0.
3615 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3616 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3617 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3618 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3619 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3620 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3621 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Eye vector (EyePosition - position) expressed in the same s/t/n basis,
// written to TEXCOORD6 with w forced to 0.
3622 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3623 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3624 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3625 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3626 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3627 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3628 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3629 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Last step: project the positions with the ModelViewProjectionMatrixM1 uniform.
3633 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the pixel shaders below.
// Min/Max are plain ternary macros: each argument may be evaluated twice, so
// do not pass expressions with side effects.
3636 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3637 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three components of two indexable vectors.
3638 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length and length, both built on DPSOFTRAST_Vector3Dot.
3639 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3640 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the length of v via sqrt(dot(v,v)).
// NOTE(review): presumably it then scales v in place to unit length - confirm
// against the full macro body. v is used unparenthesized, so pass a plain
// array name.
3641 #define DPSOFTRAST_Vector3Normalize(v)\
3644 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the light-direction, light-direction-map (model and
// tangent space) and fake-light modes; thread->shader_mode selects where the
// per-pixel light normal and light colour come from.
// thread->shader_permutation selects one of three shading paths:
//   SPECULAR - ambient + diffuse + specular (+ glow if GLOW is set)
//   DIFFUSE  - ambient + diffuse (+ glow if GLOW is set)
//   neither  - ambient only (+ glow if GLOW is set)
// Each path fills buffer_FragColorbgra8 for x in [startx, endx), clamping each
// channel to at most 255, and the buffer is then passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8().
3655 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3657 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3658 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 int x, startx = span->startx, endx = span->endx;
3668 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3669 float LightVectordata[4];
3670 float LightVectorslope[4];
3671 float EyeVectordata[4];
3672 float EyeVectorslope[4];
3673 float VectorSdata[4];
3674 float VectorSslope[4];
3675 float VectorTdata[4];
3676 float VectorTslope[4];
3677 float VectorRdata[4];
3678 float VectorRslope[4];
3680 float diffusetex[4];
3682 float surfacenormal[4];
3683 float lightnormal[4];
3684 float lightnormal_modelspace[4];
3686 float specularnormal[4];
3689 float SpecularPower;
// Colour uniforms are stored reversed (uniform components 0,1,2 go to indices
// 2,1,0) so that index i pairs with byte i of the ...bgra8 buffers in the
// arithmetic below.
3691 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3692 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3693 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3694 Color_Glow[3] = 0.0f;
// The fourth ambient slot carries the Alpha uniform; it is the only factor
// applied to the output alpha channel d[3].
3695 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3696 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3697 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3698 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3699 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3700 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3701 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3702 Color_Pants[3] = 0.0f;
3703 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3704 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3705 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3706 Color_Shirt[3] = 0.0f;
// Set up the span (fills buffer_z) and sample the textures every path needs;
// pants/shirt and glow are only sampled when their permutation bit is set.
3707 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3708 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3711 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3712 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3714 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3716 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular (ambient + diffuse + specular) ----
3718 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3720 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3721 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3722 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3723 Color_Diffuse[3] = 0.0f;
3724 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3725 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3726 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3727 LightColor[3] = 0.0f;
3728 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3730 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3731 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3732 Color_Specular[3] = 0.0f;
// SpecularPower is pre-divided by 255 because it is later multiplied by the
// gloss texture's alpha byte (glosstex[3]) in the pow() exponent.
3733 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3734 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3735 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific span setup: interpolants and/or lightmap + deluxemap samples.
3737 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3739 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3740 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3741 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3742 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3743 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3745 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3747 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3748 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3752 // nothing of this needed
// Remaining modes take the light vector interpolated from TEXCOORD5.
3756 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3759 for (x = startx;x < endx;x++)
// NOTE(review): z is used below as a per-pixel scale on interpolated
// attributes; presumably it is read from buffer_z[x] - confirm.
3762 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3763 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3764 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3765 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping adds tinted pants/shirt texels to the base colour; their alpha
// factors are 0, so the alpha channel is left unchanged.
3766 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3768 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3769 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3770 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3771 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3773 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3774 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3775 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3776 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte/128 - 1 per channel, with bytes 2,1,0
// mapped to components 0,1,2, then normalize.
3777 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3778 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3779 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3780 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the direction, project it onto the
// interpolated S/T/R vectors and normalize the result.
3782 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3784 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3785 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3786 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3787 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3789 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3790 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3791 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3792 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3794 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3795 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3796 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3797 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3799 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3800 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3801 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3802 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3804 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3805 DPSOFTRAST_Vector3Normalize(lightnormal);
3807 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap colour is scaled by 1/(256 * max(0.25, lightnormal.z)); the 0.25
// floor bounds the scale factor from above.
3809 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3810 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3811 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3812 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded texel is used as the light normal
// directly (no normalization here) and the lightmap is scaled by 1/256.
3815 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3817 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3818 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3819 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3821 float f = 1.0f / 256.0f;
3822 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3823 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3824 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized eye vector and the light
// colour is white.
3827 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3829 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3830 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3831 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3832 DPSOFTRAST_Vector3Normalize(lightnormal);
3834 LightColor[0] = 1.0;
3835 LightColor[1] = 1.0;
3836 LightColor[2] = 1.0;
// Otherwise: normalized interpolated light vector, with LightColor left at
// the uniform value loaded before the loop.
3840 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3841 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3842 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3843 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surfacenormal, lightnormal) clamped at zero from below.
3846 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, variant A (shader_exactspecularmath): reflect the light
// normal about the surface normal and dot it with the normalized eye vector.
3848 if(thread->shader_exactspecularmath)
3850 // reflect lightnormal at surfacenormal, take the negative of that
3851 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3853 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3854 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3855 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3856 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3858 // dot of this and normalize(EyeVectorFogDepth.xyz)
3859 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3860 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3861 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3862 DPSOFTRAST_Vector3Normalize(eyenormal);
3864 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Variant B: normalized sum of light and eye directions (half vector) dotted
// with the surface normal.
3868 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3869 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3870 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3871 DPSOFTRAST_Vector3Normalize(eyenormal);
3873 specularnormal[0] = lightnormal[0] + eyenormal[0];
3874 specularnormal[1] = lightnormal[1] + eyenormal[1];
3875 specularnormal[2] = lightnormal[2] + eyenormal[2];
3876 DPSOFTRAST_Vector3Normalize(specularnormal);
3878 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha byte.
3880 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine per channel: [glow +] ambient + (diffuse + specular) * LightColor,
// clamped to at most 255; alpha is diffuse alpha times Color_Ambient[3].
3882 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3884 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3885 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3886 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3887 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3891 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3892 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3893 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3894 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3897 buffer_FragColorbgra8[x*4+0] = d[0];
3898 buffer_FragColorbgra8[x*4+1] = d[1];
3899 buffer_FragColorbgra8[x*4+2] = d[2];
3900 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse only (ambient + diffuse) ----
// Same light-normal / light-colour selection as the specular path, without
// gloss, eye-vector specular math or colormapping.
3903 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3905 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3906 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3907 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3908 Color_Diffuse[3] = 0.0f;
3909 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3910 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3911 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3912 LightColor[3] = 0.0f;
3913 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3915 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3917 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3918 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3919 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3920 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3921 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3923 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Unlike the specular path, the eye vector is only needed here for fake light.
3928 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3930 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3934 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3937 for (x = startx;x < endx;x++)
3940 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3941 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3942 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3943 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3944 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3945 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3946 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3947 DPSOFTRAST_Vector3Normalize(surfacenormal);
3949 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3951 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3952 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3953 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3954 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3956 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3957 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3958 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3959 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3961 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3962 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3963 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3964 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3966 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3967 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3968 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3969 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3971 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3972 DPSOFTRAST_Vector3Normalize(lightnormal);
3974 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3976 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3977 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3978 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3979 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3982 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3984 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3985 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3986 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3988 float f = 1.0f / 256.0f;
3989 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3990 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3991 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3994 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3996 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3997 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3998 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3999 DPSOFTRAST_Vector3Normalize(lightnormal);
4001 LightColor[0] = 1.0;
4002 LightColor[1] = 1.0;
4003 LightColor[2] = 1.0;
4007 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4008 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4009 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4010 DPSOFTRAST_Vector3Normalize(lightnormal);
// Combine per channel: [glow +] diffusetex * (ambient + diffuse colour *
// diffuse term * LightColor), clamped to at most 255.
4013 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4014 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4016 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4017 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4018 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4019 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4023 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4024 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4025 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4026 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4028 buffer_FragColorbgra8[x*4+0] = d[0];
4029 buffer_FragColorbgra8[x*4+1] = d[1];
4030 buffer_FragColorbgra8[x*4+2] = d[2];
4031 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ----
// No normal map or light vector is used; output is [glow +] diffusetex *
// ambient, clamped to at most 255.
4036 for (x = startx;x < endx;x++)
4039 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4040 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4041 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4042 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4044 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4046 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4047 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4048 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4049 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4053 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4054 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4055 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4056 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4058 buffer_FragColorbgra8[x*4+0] = d[0];
4059 buffer_FragColorbgra8[x*4+1] = d[1];
4060 buffer_FragColorbgra8[x*4+2] = d[2];
4061 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colours over for output.
4064 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source mode. For every vertex it computes the
// vertex-to-light and vertex-to-eye vectors, expresses both in the basis
// formed by TEXCOORD1 (svector), TEXCOORD2 (tvector) and TEXCOORD3 (normal),
// and writes them back into TEXCOORD1 and TEXCOORD2 respectively. Positions
// are then projected and TEXCOORD3 is regenerated from POSITION using the
// ModelToLightM1 matrix.
4069 static void DPSOFTRAST_VertexShader_LightSource(void)
4072 int numvertices = dpsoftrast.numvertices;
4073 float LightPosition[4];
4074 float LightVector[4];
4075 float LightVectorModelSpace[4];
4076 float EyePosition[4];
4077 float EyeVectorModelSpace[4];
// Copy the LightPosition and EyePosition uniforms into locals.
4083 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4084 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4085 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4086 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4087 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4088 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4089 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4090 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input arrays into post_array4f; TEXCOORD0 is additionally
// transformed by the TexMatrixM1 uniform matrix.
4091 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4092 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4093 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4094 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4095 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4096 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4097 for (i = 0;i < numvertices;i++)
// Copy position and the basis vectors into locals first: TEXCOORD1 and
// TEXCOORD2 are overwritten with the results further down in this iteration.
4099 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4100 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4101 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4102 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4103 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4104 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4105 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4106 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4107 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4108 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4109 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4110 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector (LightPosition - position) projected onto s/t/n, stored in
// TEXCOORD1 with w forced to 0.
4111 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4112 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4113 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4114 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4115 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4116 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4117 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4118 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4119 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4120 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector (EyePosition - position) projected onto s/t/n, stored in
// TEXCOORD2 with w forced to 0.
4121 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4122 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4123 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4124 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4125 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4126 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4127 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4128 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project the positions, then fill TEXCOORD3 from the POSITION array using
// the ModelToLightM1 matrix.
// NOTE(review): this relies on DPSOFTRAST_Array_Transform reading the original
// (unprojected) POSITION input rather than the projected result - confirm
// against that helper.
4132 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4133 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader for the light-source mode: shades pixels [startx, endx) of
// one span into buffer_FragColorbgra8 and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f and shader_permutation
//   triangle - triangle whose interpolated attributes are evaluated
//   span     - span being shaded; supplies startx/endx
// NOTE(review): this listing omits brace-only lines and some statements. The
// declarations of z, glosstex, eyenormal, diffuse, specular, attenuation, d and
// f, the per-pixel assignment of z, the `else` lines, and the statements guarded
// by the `attenuation < 0.01f` tests are not visible here -- confirm against the
// full file before relying on the notes below.
4136 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per pixel for buffer_z, four bytes per
// pixel for each *bgra8 buffer.
4139 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4140 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4141 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4142 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147 int x, startx = span->startx, endx = span->endx;
4148 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is a (data, slope) pair; the loops below evaluate
// it at pixel x as (data + slope*x) * z.
4149 float CubeVectordata[4];
4150 float CubeVectorslope[4];
4151 float LightVectordata[4];
4152 float LightVectorslope[4];
4153 float EyeVectordata[4];
4154 float EyeVectorslope[4];
4156 float diffusetex[4];
4158 float surfacenormal[4];
4159 float lightnormal[4];
4161 float specularnormal[4];
4164 float SpecularPower;
4165 float CubeVector[4];
// Colour uniforms are stored with components 0 and 2 swapped (uniform +0 lands
// in index 2), so local index order lines up with the byte order implied by the
// *bgra8 buffer names they are combined with below.
// Color_Glow is loaded here but not read by any line visible in this function.
4168 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4169 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4170 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4171 Color_Glow[3] = 0.0f;
4172 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4173 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4174 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component carries the Alpha uniform rather than a colour.
4175 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4176 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4177 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4178 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4179 Color_Diffuse[3] = 0.0f;
4180 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4181 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4182 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4183 Color_Specular[3] = 0.0f;
4184 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4185 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4186 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4187 Color_Pants[3] = 0.0f;
4188 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4189 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4190 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4191 Color_Shirt[3] = 0.0f;
4192 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4193 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4194 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4195 LightColor[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by glosstex[3], which is
// read from a byte buffer.
4196 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD1/2/3 are read as the light vector, the eye vector and the vector
// used for attenuation, cube lookup and shadow-map sampling.
4197 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4198 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4199 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4200 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4201 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Texture samples for the whole span are fetched up front; the pants/shirt and
// cube buffers are only filled when the matching permutation bit is set.
4202 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4203 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4205 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4206 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4208 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4209 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// Path 1: specular permutation -- ambient + diffuse + specular terms.
4210 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214 for (x = startx;x < endx;x++)
4217 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4218 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4219 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// Attenuation falls off as 1 - |CubeVector|^2.
4220 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not in the listing;
// presumably the pixel is skipped (see the memset comment above) -- confirm.
4221 if (attenuation < 0.01f)
4223 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4225 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4226 if (attenuation < 0.01f)
4230 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4231 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4232 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4233 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping adds the pants and shirt layers, each tinted by its colour uniform.
4234 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4236 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4237 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4238 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4239 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4241 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4242 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4243 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4244 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte * (1/128) - 1, reading bytes 2,1,0 into
// components 0,1,2, then normalise.
4245 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4246 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4247 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4248 DPSOFTRAST_Vector3Normalize(surfacenormal);
4250 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4251 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4252 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4253 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(N, L), clamped below at zero.
4255 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Exact specular: reflect the light direction about the surface normal and
// compare it with the eye direction.
4257 if(thread->shader_exactspecularmath)
4259 // reflect lightnormal at surfacenormal, take the negative of that
4260 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4262 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4263 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4264 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4265 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4267 // dot of this and normalize(EyeVectorFogDepth.xyz)
4268 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4269 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4270 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4271 DPSOFTRAST_Vector3Normalize(eyenormal);
4273 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Alternative specular (presumably the `else` branch; that line is not in the
// listing): normalised sum of light and eye directions, dotted with the normal.
4277 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4278 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4279 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4280 DPSOFTRAST_Vector3Normalize(eyenormal);
4282 specularnormal[0] = lightnormal[0] + eyenormal[0];
4283 specularnormal[1] = lightnormal[1] + eyenormal[1];
4284 specularnormal[2] = lightnormal[2] + eyenormal[2];
4285 DPSOFTRAST_Vector3Normalize(specularnormal);
4287 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha byte (SpecularPower already
// carries the 1/255 factor).
4289 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Final colour per channel. Results are clamped to at most 255; no lower clamp
// is applied in the visible lines. Alpha is taken from diffusetex[3] alone.
4291 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4293 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4294 attenuation *= (1.0f / 255.0f);
4295 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4296 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4297 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4298 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Same formula without the cube-map factor (presumably the `else` of the
// CUBEFILTER test; that line is not in the listing).
4302 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4303 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4304 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4305 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4307 buffer_FragColorbgra8[x*4+0] = d[0];
4308 buffer_FragColorbgra8[x*4+1] = d[1];
4309 buffer_FragColorbgra8[x*4+2] = d[2];
4310 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 2: diffuse permutation -- ambient + diffuse terms, no gloss lookup and
// no specular term.
4313 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4315 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4316 for (x = startx;x < endx;x++)
4319 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4320 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4321 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4322 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4323 if (attenuation < 0.01f)
4325 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4327 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4328 if (attenuation < 0.01f)
4332 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4333 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4334 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4335 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4336 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4338 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4339 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4340 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4341 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4343 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4344 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4345 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4346 DPSOFTRAST_Vector3Normalize(surfacenormal);
4348 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4349 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4350 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4351 DPSOFTRAST_Vector3Normalize(lightnormal);
4353 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4354 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4356 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4357 attenuation *= (1.0f / 255.0f);
4358 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4359 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4360 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4361 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4365 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4366 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4367 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4368 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4370 buffer_FragColorbgra8[x*4+0] = d[0];
4371 buffer_FragColorbgra8[x*4+1] = d[1];
4372 buffer_FragColorbgra8[x*4+2] = d[2];
4373 buffer_FragColorbgra8[x*4+3] = d[3];
// Path 3 (presumably the final `else`; that line is not in the listing):
// ambient term only -- no normal-map lookup, no diffuse or specular term.
4378 for (x = startx;x < endx;x++)
4381 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4382 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4383 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4384 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4385 if (attenuation < 0.01f)
4387 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4389 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4390 if (attenuation < 0.01f)
4394 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4395 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4396 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4397 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4398 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4400 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4401 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4402 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4403 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4405 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4407 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4408 attenuation *= (1.0f / 255.0f);
4409 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4410 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4411 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4412 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4416 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4417 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4418 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4419 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4421 buffer_FragColorbgra8[x*4+0] = d[0];
4422 buffer_FragColorbgra8[x*4+1] = d[1];
4423 buffer_FragColorbgra8[x*4+2] = d[2];
4424 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
4427 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction mode. Issues three array operations using
// matrices taken from dpsoftrast.uniform4f:
//   TEXCOORD4 <- POSITION  with ModelViewProjectionMatrixM1 (Transform)
//   TEXCOORD0 <- TEXCOORD0 with TexMatrixM1                 (Transform)
//   POSITION  <- POSITION  with ModelViewProjectionMatrixM1 (TransformProject)
// NOTE(review): the (output array, input array, matrix) argument order is
// inferred from the call sites -- confirm against the helper definitions.
4433 static void DPSOFTRAST_VertexShader_Refraction(void)
4435 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4436 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4437 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the refraction mode: for each pixel in [startx, endx)
// it derives a screen-space texture coordinate, offsets it by the normal-map
// sample, reads the texture bound to GL20TU_REFRACTION there and writes the
// texel tinted by RefractColor into buffer_FragColorbgra8.
// Returns without drawing when no refraction texture is bound.
// NOTE(review): the declarations of iw, v and c are not visible in this
// listing (brace-only and some other lines are omitted).
4440 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4442 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4444 int x, startx = span->startx, endx = span->endx;
4447 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4448 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4451 float ModelViewProjectionPositiondata[4];
4452 float ModelViewProjectionPositionslope[4];
4455 float ScreenScaleRefractReflect[2];
4456 float ScreenCenterRefractReflect[2];
4457 float DistortScaleRefractReflect[2];
4458 float RefractColor[4];
4460 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4461 if(!texture) return;
4464 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4465 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// TEXCOORD4 is read as the ModelViewProjectionPosition attribute.
4468 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
// Only the first two components of each screen/distort uniform are used here.
4471 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4472 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4473 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4474 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4475 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4476 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// Colour components 0 and 2 are swapped on load (uniform +2 lands in index 0)
// so they line up with the byte order implied by the bgra8 buffers; the fourth
// component is copied unchanged.
4477 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4478 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4479 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4480 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4483 for (x = startx;x < endx;x++)
4485 float SafeScreenTexCoord[2];
4486 float ScreenTexCoord[2];
4493 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// Reciprocal of the interpolated w; the per-pixel z factor is left out because
// it cancels when x/y are multiplied by iw below.
4494 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4496 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4497 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4498 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted shader line below mentions DistortScaleRefractReflect.zw,
// while this code uses components 0 and 1 -- confirm which one is intended.
4500 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// Decode the normal-map texel (byte * 1/128 - 1, bytes 2,1,0 -> components
// 0,1,2) and normalise; only v[0] and v[1] are used for the distortion.
4501 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4502 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4503 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4504 DPSOFTRAST_Vector3Normalize(v);
4505 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4506 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4508 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4509 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the float products are stored to bytes without a clamp; this
// assumes the RefractColor components keep them within 0..255 -- TODO confirm.
4511 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4512 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4513 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
// Alpha comes from RefractColor[3] only, scaled by 256 and capped at 255.
4514 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4517 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water mode. For every vertex it builds the vector from
// the vertex position to the EyePosition uniform, expresses it in the basis
// read from TEXCOORD1/2/3 (svector, tvector, normal) and stores the result in
// TEXCOORD6. It then runs the position and texture-coordinate array transforms.
// NOTE(review): the declarations of i, position, svector, tvector, normal and
// EyeVector are not visible in this listing (some lines are omitted).
4522 static void DPSOFTRAST_VertexShader_Water(void)
4525 int numvertices = dpsoftrast.numvertices;
4526 float EyePosition[4];
4527 float EyeVectorModelSpace[4];
4533 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4534 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4535 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4536 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Load the arrays that the loop below reads through post_array4f.
4537 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4538 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4539 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4540 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4541 for (i = 0;i < numvertices;i++)
4543 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4544 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4545 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4546 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4547 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4548 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4549 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4550 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4551 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4552 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4553 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4554 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the eye position.
4555 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4556 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4557 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// Project it onto svector, tvector and normal (three dot products).
4558 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4559 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4560 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4561 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4562 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4563 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4564 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the meaning of the -1 second argument is defined by
// DPSOFTRAST_Array_TransformProject, which is outside this listing -- presumably
// it reuses the POSITION data loaded above; confirm.
4566 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4567 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4568 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Span pixel shader for the water mode: for each pixel in [startx, endx) it
// samples the textures bound to GL20TU_REFRACTION and GL20TU_REFLECTION at
// normal-map-distorted screen coordinates and blends the two tinted samples
// with a Fresnel factor derived from the interpolated eye vector.
// Returns without drawing when either texture is not bound.
// NOTE(review): the declarations of iw, v and Fresnel are not visible in this
// listing (brace-only and some other lines are omitted).
4572 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4574 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4576 int x, startx = span->startx, endx = span->endx;
4579 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4580 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4583 float ModelViewProjectionPositiondata[4];
4584 float ModelViewProjectionPositionslope[4];
4585 float EyeVectordata[4];
4586 float EyeVectorslope[4];
4589 float ScreenScaleRefractReflect[4];
4590 float ScreenCenterRefractReflect[4];
4591 float DistortScaleRefractReflect[4];
4592 float RefractColor[4];
4593 float ReflectColor[4];
4594 float ReflectFactor;
4595 float ReflectOffset;
4597 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4598 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4599 if(!texture_refraction || !texture_reflection) return;
4602 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4603 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// TEXCOORD4 is read as ModelViewProjectionPosition, TEXCOORD6 as the eye vector.
4606 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4607 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// In each four-component screen/distort uniform, components 0/1 feed the
// refraction lookup and components 2/3 feed the reflection lookup (see the
// DPSOFTRAST_Texture2DBGRA8 calls at the end of the loop).
4610 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4611 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4612 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4613 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4614 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4615 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4616 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4617 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4618 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4619 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4620 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4621 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// Colour components 0 and 2 are swapped on load (uniform +2 lands in index 0)
// so they line up with the byte order implied by the bgra8 buffers; the fourth
// component is copied unchanged.
4622 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4623 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4624 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4625 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4626 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4627 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4628 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4629 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4630 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4631 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4634 for (x = startx;x < endx;x++)
4636 float SafeScreenTexCoord[4];
4637 float ScreenTexCoord[4];
4640 unsigned char c1[4];
4641 unsigned char c2[4];
4646 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// Reciprocal of the interpolated w; the per-pixel z factor is left out because
// it cancels when x/y are multiplied by iw below.
4647 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4649 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4650 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4651 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4652 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4653 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4655 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// Decode the normal-map texel (byte * 1/128 - 1, bytes 2,1,0 -> components
// 0,1,2) and normalise; v[0]/v[1] distort both coordinate pairs.
4656 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4657 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4658 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4659 DPSOFTRAST_Vector3Normalize(v);
4660 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4661 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4662 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4663 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4665 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector; the z factor is skipped because the vector is
// normalised immediately afterwards, which removes any uniform scale.
4666 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4667 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4668 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4669 DPSOFTRAST_Vector3Normalize(v);
// Fresnel = min(1, 1 - eye.z)^2 * ReflectFactor + ReflectOffset.
4670 Fresnel = 1.0f - v[2];
4671 Fresnel = min(1.0f, Fresnel);
4672 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4674 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4675 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4676 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4677 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// Linear blend: refraction weighted by (1 - Fresnel), reflection by Fresnel.
// NOTE(review): the colour channels are stored to bytes without a clamp; this
// assumes the uniforms keep the blend within 0..255 -- TODO confirm.
4679 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4680 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4681 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
// Alpha blends the two colour uniforms' fourth components, scaled by 256 and
// capped at 255.
4682 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4685 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the POSITION array given as both
// array arguments, using the uniform data that starts at slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (uniform4f holds 4 floats per slot).
4690 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4692 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// As visible here it does not compute any per-pixel colour: after
// DPSOFTRAST_Draw_Span_Begin it zero-fills the BGRA8 bytes for the pixel range
// [span->startx, span->endx) and passes the buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4695 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span (4 bytes per pixel for the colour buffer)
4698 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4699 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4700 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes of the active pixel range are cleared; the rest of the buffer stays uninitialized
4701 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4702 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the POSITION array given as both
// array arguments, using the uniform data that starts at slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (uniform4f holds 4 floats per slot).
4707 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4709 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// As visible here it does not compute any per-pixel colour: after
// DPSOFTRAST_Draw_Span_Begin it zero-fills the BGRA8 bytes for the pixel range
// [span->startx, span->endx) and passes the buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4712 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span (4 bytes per pixel for the colour buffer)
4715 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4716 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4717 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes of the active pixel range are cleared; the rest of the buffer stays uninitialized
4718 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4719 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with the POSITION array given as both
// array arguments, using the uniform data that starts at slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (uniform4f holds 4 floats per slot).
4724 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4726 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// As visible here it does not compute any per-pixel colour: after
// DPSOFTRAST_Draw_Span_Begin it zero-fills the BGRA8 bytes for the pixel range
// [span->startx, span->endx) and passes the buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4729 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span (4 bytes per pixel for the colour buffer)
4732 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4733 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4734 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes of the active pixel range are cleared; the rest of the buffer stays uninitialized
4735 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4736 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the functions that implement it and the vertex
// arrays / texture units it consumes. One entry per mode lives in
// DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table initializers start with an integer and the draw code
// reads a `lodarrayindex` member; that member's declaration is not among the
// lines visible here - confirm against the full file.
4741 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage, invoked with no arguments by DPSOFTRAST_DrawTriangles
4744 void (*Vertex)(void);
// span (pixel) stage, invoked per span by DPSOFTRAST_Draw_ProcessSpans
4745 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices used by this mode; readers stop at the first value >= DPSOFTRAST_ARRAY_TOTAL
4746 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture units used by this mode; readers stop at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4747 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4749 DPSOFTRAST_ShaderModeInfo;
// Shader-mode dispatch table, indexed by shader mode (thread->shader_mode /
// dpsoftrast.shader_mode). Each row is:
//   { lod array index, vertex function, span function, {arrays..., ~0}, {texunits..., ~0} }
// The ~0 entries are list terminators: stored in an unsigned char they are larger
// than DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS, which is what the
// readers of these lists test for.
// The ShowDepth, DeferredGeometry and DeferredLightSource rows spell out the
// texunits terminator explicitly. An omitted initializer would be zero-filled,
// and a texunits[0] of 0 is not recognised as a terminator, so the per-triangle
// mip setup would treat every slot as texture unit 0 for these modes.
4751 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4753 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4754 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4755 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4756 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4757 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4758 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4759 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4760 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4761 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4762 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4763 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4764 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4765 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4766 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4767 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4768 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4769 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4770 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}},
// Builds the per-pixel pass/fail mask for one span and stores it in the span.
// With depth testing enabled and a depth buffer present, each pixel of the span
// is compared against the stored depth value using thread->fb_depthfunc;
// otherwise every pixel in the range is marked as passing.
//   thread - supplies the depth state and the pixelmaskarray scratch buffer
//   span   - read for position/depth; its pixelmask and startx are written back
// NOTE(review): several declarations, the endx setup and the bodies of the two
// trimming loops are not visible in this view.
4773 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4778 unsigned int *depthpixel;
4782 unsigned char *pixelmask;
4783 DPSOFTRAST_State_Triangle *triangle;
4784 triangle = &thread->triangles[span->triangle];
// depthpixel points at the depth-buffer entry for (span->x, span->y); it is indexed by x relative to the span
4785 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4786 startx = span->startx;
4788 depth = span->depthbase;
4789 depthslope = span->depthslope;
4790 pixelmask = thread->pixelmaskarray;
4791 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4793 switch(thread->fb_depthfunc)
// in every case d steps by depthslope per pixel, starting from depthbase + depthslope*startx;
// the comparison is written as <stored value> OP <span value d>
4796 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4797 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4798 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4799 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4800 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4801 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4802 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// scan past failing pixels at both ends of the span (loop bodies not visible; presumably startx++ / endx--)
4804 while (startx < endx && !pixelmask[startx])
4806 while (endx > startx && !pixelmask[endx-1])
4811 // no depth testing means we're just dealing with color...
4812 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the (possibly trimmed) start of the range back to the span
4814 span->pixelmask = pixelmask;
4815 span->startx = startx;
// Depth-write pass for one span. Does nothing unless the depth mask, depth
// testing and a depth buffer are all enabled; otherwise it walks the span with
// the same per-pixel depth stepping used by the depth test.
//   thread - supplies depthmask / depthtest state
//   span   - supplies position, depth base/slope, pixel range and pixelmask
// NOTE(review): the loop body is not visible in this view; presumably it stores
// d into depthpixel[x] for pixels whose pixelmask entry is set - confirm.
4819 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4821 int x, d, depth, depthslope, startx, endx;
4822 const unsigned char *pixelmask;
4823 unsigned int *depthpixel;
4824 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4826 depth = span->depthbase;
4827 depthslope = span->depthslope;
4828 pixelmask = span->pixelmask;
4829 startx = span->startx;
// depthpixel points at the depth-buffer entry for (span->x, span->y)
4831 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4832 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans: for each span it runs the depth test, then
// the shader mode's Span function (only when colour buffer 0 exists and the
// colour mask is non-zero), then the depth write. The span count is reset to 0
// afterwards.
//   thread - owns the spans[] / triangles[] queues being processed
4838 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4841 DPSOFTRAST_State_Triangle *triangle;
4842 DPSOFTRAST_State_Span *span;
4843 for (i = 0; i < thread->numspans; i++)
4845 span = &thread->spans[i];
4846 triangle = &thread->triangles[span->triangle];
4847 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have trimmed the span to nothing (the guarded statement is not visible; presumably it skips the span)
4848 if (span->startx >= span->endx)
4850 // run pixel shader if appropriate
4851 // do this before running depthmask code, to allow the pixelshader
4852 // to clear pixelmask values for alpha testing
4853 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4854 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4855 DPSOFTRAST_Draw_DepthWrite(thread, span);
4857 thread->numspans = 0;
// Draw command, opcode 22, with the payload fields listed here.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably it declares
// DPSOFTRAST_Command_Draw (the type consumed below) - confirm.
4860 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Executes one Draw command on behalf of one thread: for every triangle it
// clips against the near plane, rejects triangles outside the scissor/band
// area, sets up per-triangle interpolation data and mip levels, and emits spans
// for the rows that belong to this thread. Spans are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up and
// once more at the end. The command's refcount is decremented when the thread
// is done with it; the thread that brings it to zero frees command->arrays if
// that buffer was allocated separately from the command.
//   thread  - per-thread rasterizer state (scissor, bands, queues, bound textures)
//   command - the draw command to execute
// NOTE(review): many short lines (braces, else/continue/return, some
// declarations) are not visible in this view; comments below describe only what
// the visible lines establish.
4862 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4865 int cullface = thread->cullface;
4866 int minx, maxx, miny, maxy;
4867 int miny1, maxy1, miny2, maxy2;
4868 __m128i fbmin, fbmax;
4869 __m128 viewportcenter, viewportscale;
4870 int firstvertex = command->firstvertex;
4871 int numvertices = command->numvertices;
4872 int numtriangles = command->numtriangles;
4873 const int *element3i = command->element3i;
4874 const unsigned short *element3s = command->element3s;
4875 int clipped = command->clipped;
4882 int starty, endy, bandy;
4886 float clip0origin, clip0slope;
4888 __m128 triangleedge1, triangleedge2, trianglenormal;
4891 DPSOFTRAST_State_Triangle *triangle;
4892 DPSOFTRAST_Texture *texture;
4893 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, and the thread's two row bands clamped into it
4894 miny = thread->fb_scissor[1];
4895 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4896 miny1 = bound(miny, thread->miny1, maxy);
4897 maxy1 = bound(miny, thread->maxy1, maxy);
4898 miny2 = bound(miny, thread->miny2, maxy);
4899 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range touches neither band: just release this thread's reference
4900 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4902 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare command struct keeps its data in a separate allocation
4904 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4905 MM_FREE(command->arrays);
4909 minx = thread->fb_scissor[0];
4910 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle as repeated 16-bit (x, y) pairs: min is (minx, miny1), max is (maxx-1, maxy2-1)
4911 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4912 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4913 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4914 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4915 screen[3] = _mm_setzero_ps();
4916 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4917 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space float4s; `arrays` begins at the float4 block that follows
4919 const float *screencoord4f = command->arrays;
4920 const float *arrays = screencoord4f + numvertices*4;
4922 // generate the 3 edges of this triangle
4923 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices come from the 16-bit or the 32-bit element list (the selecting condition is not visible), rebased by firstvertex
4926 e[0] = element3s[i*3+0] - firstvertex;
4927 e[1] = element3s[i*3+1] - firstvertex;
4928 e[2] = element3s[i*3+2] - firstvertex;
4932 e[0] = element3i[i*3+0] - firstvertex;
4933 e[1] = element3i[i*3+1] - firstvertex;
4934 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: derives two edge vectors from screen[0..2] and tests the sign of the resulting normal component against zero
4943 #define SKIPBACKFACE \
4944 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4945 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4946 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4947 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4948 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4952 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4956 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4961 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4962 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4964 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4965 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4967 #define CLIPPEDVERTEXCOPY(k,p1) \
4968 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY loads vertex p1's value unchanged from the current `arrays` block
4970 #define GENATTRIBCOPY(attrib, p1) \
4971 attrib = _mm_load_ps(&arrays[e[p1]*4]);
// GENATTRIBLERP interpolates between vertices p1 and p2 using the fraction stored by CLIPPEDVERTEXLERP in clipfrac[p1]
4972 #define GENATTRIBLERP(attrib, p1, p2) \
4974 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4977 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4981 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4982 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4983 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4985 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4986 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4987 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4993 // calculate distance from nearplane
4994 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4995 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4996 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// one branch per combination of vertices with clipdist >= 0; vertices are copied as-is or replaced by points interpolated along the crossing edges
4997 if (clipdist[0] >= 0.0f)
4999 if (clipdist[1] >= 0.0f)
5001 if (clipdist[2] >= 0.0f)
5004 // triangle is entirely in front of nearplane
5005 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5012 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5020 if (clipdist[2] >= 0.0f)
5022 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5029 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5036 else if (clipdist[1] >= 0.0f)
5038 if (clipdist[2] >= 0.0f)
5040 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5047 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5053 else if (clipdist[2] >= 0.0f)
5055 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5060 else continue; // triangle is entirely behind nearplane
5063 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as 16-bit values; a 3-point polygon repeats point 2 in the fourth slot
5064 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5065 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5066 screenmin = _mm_min_epi16(screeni, screenir),
5067 screenmax = _mm_max_epi16(screeni, screenir);
// reduce to the bounding box of all points, then clamp it to the clip rectangle
5068 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5069 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5070 screenmin = _mm_max_epi16(screenmin, fbmin);
5071 screenmax = _mm_min_epi16(screenmax, fbmax);
5072 // skip offscreen triangles
5073 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5075 starty = _mm_extract_epi16(screenmin, 1);
5076 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows [starty, endy) begin at or after the end of band 1 and end at or before the start of band 2
5077 if (starty >= maxy1 && endy <= miny2)
5079 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
5082 triangle = &thread->triangles[thread->numtriangles];
5084 // calculate attribute plans for triangle data...
5085 // okay, this triangle is going to produce spans, we'd better project
5086 // the interpolants now (this is what gives perspective texturing),
5087 // this consists of simply multiplying all arrays by the W coord
5088 // (which is basically 1/Z), which will be undone per-pixel
5089 // (multiplying by Z again) to get the perspective-correct array
5092 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5093 __m128 mipedgescale, mipdensity;
// edge components divided by the normal's scalar component, then broadcast one component per register
5094 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5095 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5096 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5097 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5098 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// component 3 of each screen vertex, broadcast; its x/y slopes and origin are stored in triangle->w[0..2]
5099 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5100 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5101 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5102 attribedge1 = _mm_sub_ss(w0, w1);
5103 attribedge2 = _mm_sub_ss(w2, w1);
5104 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5107 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5108 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5109 _mm_store_ss(&triangle->w[0], attribxslope);
5110 _mm_store_ss(&triangle->w[1], attribyslope);
5111 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when one of its first three coefficients is non-zero
5116 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5118 float cliporigin, clipxslope, clipyslope;
// same slope/origin construction as above, applied to component 2 of the screen vertices
5119 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5120 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5121 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5123 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5124 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// combine with the clip plane coefficients into origin / x-slope / y-slope terms
5125 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5126 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5127 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// clip0origin + row*clip0slope is later used as the per-row x boundary; clip0dir records the sign of the x slope
5130 clip0origin = -cliporigin/clipxslope;
5131 clip0slope = -clipyslope/clipxslope;
5132 clip0dir = clipxslope > 0 ? 1 : -1;
5134 else if(clipyslope > 0)
5136 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5137 clip0slope = dpsoftrast.fb_width;
5140 else if(clipyslope < 0)
5142 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5143 clip0slope = -dpsoftrast.fb_width;
5146 else if(clip0origin < 0) continue;
5149 mipedgescale = _mm_setzero_ps();
// set up interpolation data for every array the current shader mode lists
5150 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5152 __m128 attrib0, attrib1, attrib2;
5153 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// a value >= DPSOFTRAST_ARRAY_TOTAL marks the end of the list
5154 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next float4 block inside command->arrays
5156 arrays += numvertices*4;
5157 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by the vertex w values before the slopes are taken (see the perspective note above)
5158 attriborigin = _mm_mul_ps(attrib1, w1);
5159 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5160 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5161 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5162 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5163 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored per array as [0] = x slope, [1] = y slope, [2] = origin
5164 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5165 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5166 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex also provides the edge scale used for mip selection
5167 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5169 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5170 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5171 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5172 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip levels default to 0 and are overridden per texture unit below
5176 memset(triangle->mip, 0, sizeof(triangle->mip));
5177 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5179 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// a value >= DPSOFTRAST_MAXTEXTUREUNITS marks the end of the list
5180 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5182 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
5183 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5185 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5186 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5187 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5188 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5189 // this will be multiplied in the texturing routine by the texture resolution
5190 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), clamped to the texture's last mip level
5193 y = (int)(log((float)y)*0.5f/M_LN2);
5194 if (y > texture->mipmaps - 1)
5195 y = texture->mipmaps - 1;
5196 triangle->mip[texunit] = y;
// walk the rows of band 1 first (bandy = end of band 1), then continue from miny2 with bandy = end of band 2
5202 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5205 __m128 xcoords, xslope;
// one 32-bit lane per point is set when y is greater than that point's integer y; the mask has 4 bits per lane
5206 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5207 int yccmask = _mm_movemask_epi8(ycc);
5208 int edge0p, edge0n, edge1p, edge1n;
// the mask selects the two edges (p -> n point indices) crossing the current row; first table is for 4-point polygons
5217 case 0xFFFF: /*0000*/ y = endy; continue;
5218 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5219 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5220 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5221 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5222 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5223 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5224 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5225 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5226 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5227 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5228 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5229 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5230 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5231 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5232 case 0x0000: /*1111*/ y++; continue;
// second table: 3-point polygons (the fourth lane duplicates point 2)
5240 case 0xFFFF: /*000*/ y = endy; continue;
5241 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5242 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5243 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5244 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5245 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5247 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be walked with the selected edge pair, clamped to the current band below
5250 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5251 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5252 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5253 nexty = _mm_extract_epi16(ycc, 0);
5254 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both edges (dx divided by dy), and their x positions on row y, offset by 0.5
5255 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5256 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5257 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5258 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5259 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low lane so it becomes startx
5260 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5262 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5263 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5265 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// emit spans row by row, stepping the edge x positions and the clip boundary together
5266 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5268 int startx, endx, offset;
5269 startx = _mm_cvtss_si32(xcoords);
5270 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp to the horizontal scissor range
5271 if (startx < minx) startx = minx;
5272 if (endx > maxx) endx = maxx;
5273 if (startx >= endx) continue;
// trim the row against the clip boundary clip0 (the conditions selecting between these cases are not visible)
5281 if(endx <= clip0) continue;
5282 startx = (int)clip0;
5285 else if (endx > clip0)
5287 if(startx >= clip0) continue;
// rows longer than DPSOFTRAST_DRAW_MAXSPANLENGTH are split into several spans
5292 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5294 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5295 span->triangle = thread->numtriangles;
5299 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5300 if (span->startx >= span->endx)
// integer depth terms: w at the span origin and its per-pixel slope, scaled by DPSOFTRAST_DEPTHSCALE, minus the polygon-offset term
5302 wslope = triangle->w[0];
5303 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5304 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5305 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// flush as soon as the span queue is full
5306 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5307 DPSOFTRAST_Draw_ProcessSpans(thread);
// flush and restart the triangle queue when it is full
5312 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5314 DPSOFTRAST_Draw_ProcessSpans(thread);
5315 thread->numtriangles = 0;
// done with the command: drop this thread's reference; whoever reaches zero frees a separately allocated data buffer
5319 if (!ATOMIC_DECREMENT(command->refcount))
5321 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5322 MM_FREE(command->arrays);
// process whatever is still queued
5325 if (thread->numspans > 0 || thread->numtriangles > 0)
5327 DPSOFTRAST_Draw_ProcessSpans(thread);
5328 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its vertex data and a private
// copy of the element (index) list, and points the global post-transform array
// table (dpsoftrast.post_array4f / screencoord4f) at that storage.
// Data layout, in order: screen coordinates, POSITION, one float4 array per
// array listed for the current shader mode, then the copied indices.
//   firstvertex, numvertices, numtriangles - stored in the command
//   element3i / element3s - 32-bit / 16-bit index lists; the one in use is copied
// NOTE(review): the return statement and the conditions choosing between the
// 16-bit and 32-bit index paths are not visible in this view.
5333 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5337 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always needed: screen coordinates and POSITION
5338 int datasize = 2*numvertices*sizeof(float[4]);
5339 DPSOFTRAST_Command_Draw *command;
5340 unsigned char *data;
// plus one float4 array for each array the shader mode lists (list ends at a value >= DPSOFTRAST_ARRAY_TOTAL)
5341 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5343 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5344 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5346 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, 16-bit or 32-bit
5349 datasize += numtriangles*sizeof(unsigned short[3]);
5351 datasize += numtriangles*sizeof(int[3]);
5352 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to sit inside the command: allocate the data separately and keep the command at its bare size
5353 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5355 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
// NOTE(review): no NULL check on this allocation is visible before data is used below - confirm
5356 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data follows the command struct in the same allocation
5360 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5361 data = (unsigned char *)command + commandsize;
5363 command->firstvertex = firstvertex;
5364 command->numvertices = numvertices;
5365 command->numtriangles = numtriangles;
5366 command->arrays = (float *)data;
// clear all output array pointers, then assign the ones backed by this command's data
5367 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5368 dpsoftrast.firstvertex = firstvertex;
5369 dpsoftrast.numvertices = numvertices;
5370 dpsoftrast.screencoord4f = (float *)data;
5371 data += numvertices*sizeof(float[4]);
5372 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5373 data += numvertices*sizeof(float[4]);
5374 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5376 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5377 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5379 dpsoftrast.post_array4f[j] = (float *)data;
5380 data += numvertices*sizeof(float[4]);
// the index copy goes last; only one of the two pointers is set afterwards
5382 command->element3i = NULL;
5383 command->element3s = NULL;
5386 command->element3s = (unsigned short *)data;
5387 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5391 command->element3i = (int *)data;
5392 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point for drawing indexed triangles: allocates a Draw command,
// runs the current shader mode's vertex function, fills in the command's row
// range, clip flag and reference count, then hands the command to the worker
// threads (or flushes directly).
//   firstvertex, numvertices - vertex range used by the draw
//   numtriangles             - number of triangles in the element list
//   element3i / element3s    - 32-bit / 16-bit index lists
// NOTE(review): the early return and the else keyword before the final flush
// are not visible in this view.
5397 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5399 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the vertex function runs before the row range is read from dpsoftrast.drawstarty / drawendy
5400 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5401 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5402 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing falls inside the framebuffer rows: release the data buffer if it was allocated separately and retract the command
5403 if (command->starty >= command->endy)
5405 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5406 MM_FREE(command->arrays);
5407 DPSOFTRAST_UndoCommand(command->commandsize);
5410 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it when it finishes with the command
5411 command->refcount = dpsoftrast.numthreads;
5413 if (dpsoftrast.usethreads)
5416 DPSOFTRAST_Draw_SyncCommands();
// wake starving threads whose row bands overlap the command's row range
5417 for (i = 0; i < dpsoftrast.numthreads; i++)
5419 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5420 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5421 Thread_CondSignal(thread->drawcond);
5426 DPSOFTRAST_Draw_FlushThreads();
// Command 23, SetRenderTargets, with payload fields width and height.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it declares
// DPSOFTRAST_Command_SetRenderTargets and its opcode -- confirm.
5430 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
// Per-thread handler for SetRenderTargets: sets the DPSOFTRAST_VALIDATE_FB bit in
// thread->validate. The command payload is not read on the line shown here.
5431 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5433 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Select new render targets.
//   width, height     - framebuffer size, stored in dpsoftrast.fb_width / fb_height
//   depthpixels       - depth buffer, stored in dpsoftrast.fb_depthpixels
//   colorpixels0..3   - color buffers, stored in dpsoftrast.fb_colorpixels[0..3]
// The viewport center/scale are recomputed and a SetRenderTargets command carrying
// width and height is queued.
5435 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5437 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any argument differs from the target state currently stored in dpsoftrast
5438 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5439 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5440 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5442 dpsoftrast.fb_width = width;
5443 dpsoftrast.fb_height = height;
5444 dpsoftrast.fb_depthpixels = depthpixels;
5445 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5446 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5447 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5448 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// recompute fb_viewportcenter / fb_viewportscale from the current viewport
5449 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue the command; only width and height travel in it, the buffer pointers live in dpsoftrast
5450 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5451 command->width = width;
5452 command->height = height;
// Execute queued commands for one thread.
//   thread    - thread state whose commandoffset marks where to resume
//   endoffset - offset in dpsoftrast.commandpool.commands at which to stop
// Commands are read from the pool starting at thread->commandoffset and dispatched by
// opcode; the reached offset is written back to thread->commandoffset.
5455 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5457 int commandoffset = thread->commandoffset;
// the loop ends on exact equality, so endoffset must be an offset the walk actually lands on
5458 while (commandoffset != endoffset)
5460 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5461 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of its command struct,
// and wrap the offset to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5463 #define INTERPCOMMAND(name) \
5464 case DPSOFTRAST_OPCODE_##name : \
5465 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5466 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5467 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5468 commandoffset = 0; \
5470 INTERPCOMMAND(Viewport)
5471 INTERPCOMMAND(ClearColor)
5472 INTERPCOMMAND(ClearDepth)
5473 INTERPCOMMAND(ColorMask)
5474 INTERPCOMMAND(DepthTest)
5475 INTERPCOMMAND(ScissorTest)
5476 INTERPCOMMAND(Scissor)
5477 INTERPCOMMAND(BlendFunc)
5478 INTERPCOMMAND(BlendSubtract)
5479 INTERPCOMMAND(DepthMask)
5480 INTERPCOMMAND(DepthFunc)
5481 INTERPCOMMAND(DepthRange)
5482 INTERPCOMMAND(PolygonOffset)
5483 INTERPCOMMAND(CullFace)
5484 INTERPCOMMAND(SetTexture)
5485 INTERPCOMMAND(SetShader)
5486 INTERPCOMMAND(Uniform4f)
5487 INTERPCOMMAND(UniformMatrix4f)
5488 INTERPCOMMAND(Uniform1i)
5489 INTERPCOMMAND(SetRenderTargets)
5490 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the advance uses the size stored in the command
// itself; the thread's commandoffset is also updated right after a draw.
5492 case DPSOFTRAST_OPCODE_Draw:
5493 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5494 commandoffset += command->commandsize;
5495 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5497 thread->commandoffset = commandoffset;
// NOTE(review): presumably a marker that sends the walk back to the start of the pool -- confirm
5500 case DPSOFTRAST_OPCODE_Reset:
// record how far this thread has got
5505 thread->commandoffset = commandoffset;
// Worker thread entry point.
//   data - the DPSOFTRAST_State_Thread served by this worker
// Runs for as long as thread->index is non-negative: interprets queued commands up to
// dpsoftrast.drawcommand, and sleeps on drawcond once it has caught up.
5508 static int DPSOFTRAST_Draw_Thread(void *data)
5510 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5511 while(thread->index >= 0)
// behind: execute everything queued up to dpsoftrast.drawcommand
5513 if (thread->commandoffset != dpsoftrast.drawcommand)
5515 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// idle handling: the caught-up test is repeated under drawmutex before going to sleep
5519 Thread_LockMutex(thread->drawmutex);
5520 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// a flusher blocked on waitcond (it sets thread->waiting) is told this thread has caught up
5522 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is set only while asleep on drawcond; producers test it before signalling
5523 thread->starving = true;
5524 Thread_CondWait(thread->drawcond, thread->drawmutex);
5525 thread->starving = false;
5527 Thread_UnlockMutex(thread->drawmutex);
// Bring every thread's commandoffset up to dpsoftrast.drawcommand, then mark the
// command pool as empty (usedcommands = 0).
5533 static void DPSOFTRAST_Draw_FlushThreads(void)
5535 DPSOFTRAST_State_Thread *thread;
5537 DPSOFTRAST_Draw_SyncCommands();
5538 if (dpsoftrast.usethreads)
// pass 1: wake every worker that is behind and currently asleep on drawcond
5540 for (i = 0; i < dpsoftrast.numthreads; i++)
5542 thread = &dpsoftrast.threads[i];
// unlocked pre-check; the decision to signal is made again under the mutex
5543 if (thread->commandoffset != dpsoftrast.drawcommand)
5545 Thread_LockMutex(thread->drawmutex);
5546 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5547 Thread_CondSignal(thread->drawcond);
5548 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block on each lagging worker's waitcond until it reports having caught up
5551 for (i = 0; i < dpsoftrast.numthreads; i++)
5553 thread = &dpsoftrast.threads[i];
5554 if (thread->commandoffset != dpsoftrast.drawcommand)
5556 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by a single `if`, not a loop; if Thread_CondWait can
// return without a matching signal (POSIX condition variables may), this would proceed
// while the worker is still behind -- confirm the wrapper's guarantees
5557 if (thread->commandoffset != dpsoftrast.drawcommand)
5559 thread->waiting = true;
5560 Thread_CondWait(thread->waitcond, thread->drawmutex);
5561 thread->waiting = false;
5563 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the path used when worker threads are disabled: the caller
// interprets each thread state's pending commands itself -- confirm
5569 for (i = 0; i < dpsoftrast.numthreads; i++)
5571 thread = &dpsoftrast.threads[i];
5572 if (thread->commandoffset != dpsoftrast.drawcommand)
5573 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything queued has been consumed, so the pool's usage counter is reset
5576 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
5579 void DPSOFTRAST_Flush(void)
5581 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point; takes no arguments and returns nothing.
// NOTE(review): presumably completes all queued work in the same way as DPSOFTRAST_Flush -- confirm against the body.
5584 void DPSOFTRAST_Finish(void)
// Initialise the global rasteriser state and its per-thread state slots.
//   width, height - framebuffer size; also the initial viewport and scissor rectangle
//   numthreads    - requested worker count; threading is used only when this is > 0 and
//                   Thread_HasThreads() reports support, and the count is then limited to 1..64
//   interlace     - limited to 0..1 when threading is used, otherwise forced to 0
//   colorpixels   - color buffer installed as render target 0
//   depthpixels   - depth buffer
// (the limits assume bound(lo, x, hi) clamps x into [lo, hi] -- TODO confirm)
// NOTE(review): the returned value is presumably 0 on success -- confirm.
// NOTE(review): the MM_CALLOC result for the thread array is used without a NULL check.
5589 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; every field not assigned below stays 0 / NULL
5599 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is presumably a byte view of an int used as an endianness probe -- confirm
5600 dpsoftrast.bigendian = u.b[3];
5601 dpsoftrast.fb_width = width;
5602 dpsoftrast.fb_height = height;
5603 dpsoftrast.fb_depthpixels = depthpixels;
5604 dpsoftrast.fb_colorpixels[0] = colorpixels;
// render targets 1..3 start out unset (one assignment per index)
5605 dpsoftrast.fb_colorpixels[1] = NULL;
5606 dpsoftrast.fb_colorpixels[2] = NULL;
5607 dpsoftrast.fb_colorpixels[3] = NULL;
// initial viewport covers the whole framebuffer
5608 dpsoftrast.viewport[0] = 0;
5609 dpsoftrast.viewport[1] = 0;
5610 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5611 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5612 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no table capacity yet (texture_max = 0)
5613 dpsoftrast.texture_firstfree = 1;
5614 dpsoftrast.texture_end = 1;
5615 dpsoftrast.texture_max = 0;
5616 dpsoftrast.color[0] = 1;
5617 dpsoftrast.color[1] = 1;
5618 dpsoftrast.color[2] = 1;
5619 dpsoftrast.color[3] = 1;
5620 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5621 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
// without threading a single state slot is still allocated
5622 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5623 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread slot the same default render state
5624 for (i = 0; i < dpsoftrast.numthreads; i++)
5626 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5628 thread->cullface = GL_BACK;
5629 thread->colormask[0] = 1;
5630 thread->colormask[1] = 1;
5631 thread->colormask[2] = 1;
5632 thread->colormask[3] = 1;
5633 thread->blendfunc[0] = GL_ONE;
5634 thread->blendfunc[1] = GL_ZERO;
5635 thread->depthmask = true;
5636 thread->depthtest = true;
5637 thread->depthfunc = GL_LEQUAL;
5638 thread->scissortest = false;
5639 thread->viewport[0] = 0;
5640 thread->viewport[1] = 0;
5641 thread->viewport[2] = dpsoftrast.fb_width;
5642 thread->viewport[3] = dpsoftrast.fb_height;
5643 thread->scissor[0] = 0;
5644 thread->scissor[1] = 0;
5645 thread->scissor[2] = dpsoftrast.fb_width;
5646 thread->scissor[3] = dpsoftrast.fb_height;
5647 thread->depthrange[0] = 0;
5648 thread->depthrange[1] = 1;
5649 thread->polygonoffset[0] = 0;
5650 thread->polygonoffset[1] = 0;
5651 thread->clipplane[0] = 0;
5652 thread->clipplane[1] = 0;
5653 thread->clipplane[2] = 0;
5654 thread->clipplane[3] = 1;
5656 thread->numspans = 0;
5657 thread->numtriangles = 0;
5658 thread->commandoffset = 0;
5659 thread->waiting = false;
5660 thread->starving = false;
// all validation bits set, then validated once with a full mask
5662 thread->validate = -1;
5663 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects and the worker itself exist only in threaded mode;
// the worker runs DPSOFTRAST_Draw_Thread with this thread state as its argument
5665 if (dpsoftrast.usethreads)
5667 thread->waitcond = Thread_CreateCond();
5668 thread->drawcond = Thread_CreateCond();
5669 thread->drawmutex = Thread_CreateMutex();
5670 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stop the worker threads (threaded mode only), release texture storage, the texture
// table and the thread array, and leave the global state zeroed.
5676 void DPSOFTRAST_Shutdown(void)
5679 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5681 DPSOFTRAST_State_Thread *thread;
5682 for (i = 0; i < dpsoftrast.numthreads; i++)
5684 thread = &dpsoftrast.threads[i];
// wake the worker while holding its draw mutex
// NOTE(review): the worker loop runs while thread->index >= 0; presumably index is made
// negative before this signal so that the worker exits -- confirm
5685 Thread_LockMutex(thread->drawmutex);
5687 Thread_CondSignal(thread->drawcond);
5688 Thread_UnlockMutex(thread->drawmutex);
// wait for the worker to exit before destroying the objects it uses
5689 Thread_WaitThread(thread->thread, 0);
5690 Thread_DestroyCond(thread->waitcond);
5691 Thread_DestroyCond(thread->drawcond);
5692 Thread_DestroyMutex(thread->drawmutex);
// free the pixel storage of every texture slot below texture_end; the table pointer is
// tested first because DPSOFTRAST_Init leaves it NULL while setting texture_end to 1
5695 for (i = 0;i < dpsoftrast.texture_end;i++)
5696 if (dpsoftrast.texture && dpsoftrast.texture[i].bytes)
5697 MM_FREE(dpsoftrast.texture[i].bytes);
// NOTE(review): the table is released with free() rather than MM_FREE, presumably
// matching the way it was allocated -- confirm
5698 if (dpsoftrast.texture)
5699 free(dpsoftrast.texture);
5700 if (dpsoftrast.threads)
5701 MM_FREE(dpsoftrast.threads);
5702 memset(&dpsoftrast, 0, sizeof(dpsoftrast));