3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform layer: each branch below supplies ALIGN/ATOMIC (alignment
// decorations), MEMORY_BARRIER, the ATOMIC_COUNTER type and the
// ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD operations for one toolchain.
// Apple: int32_t counters updated through the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: GCC-style alignment attributes, Win32 Interlocked* calls.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Other GCC targets: __sync builtins on a volatile int.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) goes in front of the declaration; Interlocked* calls.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment decoration, a no-op barrier, and plain
// (non-atomic) ++ / -- / += on an ordinary int counter.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Compatibility shim: supply _mm_cvtss_f32 as a macro only for GCC releases
// older than 4.6 (never for clang).
// The version test must spell __GNUC__ in full: a misspelt, undefined name
// evaluates to 0 inside #if and would enable the shim on every GCC.  The
// minor version is only meaningful while the major version is exactly 4.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
// Allocation helpers. First variant: _mm_malloc/_mm_free with ALIGN_SIZE
// alignment.
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
// calloc-like wrapper: allocates nmemb*size aligned bytes and zero-fills them
// when the allocation succeeded.
// NOTE(review): unlike calloc, nmemb*size is not checked for overflow here.
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
// Second variant: the plain C allocator.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// in this file to size attribs[] and post_array4f[].
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture[]).
126 typedef struct DPSOFTRAST_Texture_s
133 DPSOFTRAST_TEXTURE_FILTER filter;
// counter declared with the platform atomic counter type
136 ATOMIC_COUNTER binds;
// pixel storage; DPSOFTRAST_Texture_GetByIndex treats a NULL pointer as "slot unused"
137 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset of the level inside bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records use the same size/alignment unit as ALIGN (ALIGN_SIZE).
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header: every record starts with an opcode and its size.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0: DPSOFTRAST_AllocateCommand writes it at the current pool position
// when the requested command does not fit in the space left before the pool end.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the record type DPSOFTRAST_Command_<name>,
// which begins with the same opcode/commandsize header.
// NOTE(review): presumably `fields` is spliced in after the header — confirm.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Byte capacity of the command pool (2 MiB) and a per-command size limit.
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued command records; positions in it are tracked by the
// freecommand/usedcommands values used by the allocation functions in this file.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: three float4 rows; the CALCATTRIB macros read row [0]
// as the factor for span->x, row [1] as the factor for span->y and row [2]
// as the constant term
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// Evaluate one vertex attribute at a span's (x, y) position, SSE form:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                                  + span->y * attribs[arrayindex][1]
// triangle and span are pointer expressions; data and slope are __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar form of the same computation; data and slope are float[4] arrays.
// The span argument is parenthesized before "->" (as in the SSE form) so that
// any pointer expression, e.g. &localspan, can be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal span produced while rasterizing a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] / triangles[] / pixelmaskarray[] buffers.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits stored in thread->validate; DPSOFTRAST_Validate recomputes the
// matching derived state and clears the bit. VALIDATE_DRAW is all three.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the
// (sfactor, dfactor, blendsubtract) state and stored in fb_blendmode.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// State owned by one entry of dpsoftrast.threads[]: the render state applied
// by the DPSOFTRAST_Interpret_* functions, values derived from it, the
// thread's position in the command pool, and its span/triangle work buffers.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane after DPSOFTRAST_RecalcClipPlane has applied viewport scale/center
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// byte offset into the command pool; compared against dpsoftrast.drawcommand
// and the pool's freecommand by DPSOFTRAST_Draw_FreeCommandPool
289 ATOMIC(volatile int commandoffset);
// flags used together with the thread's mutex/condition variables in
// DPSOFTRAST_Draw_FreeCommandPool
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, the state copy kept by the
// public API functions, vertex array pointers, the texture table, the worker
// thread array and the shared command pool.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers; rows are fb_width pixels apart
// (see the clear functions)
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex arrays
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture[] table; DPSOFTRAST_Texture_Grow rebases them
// when the table is reallocated
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// texture table: search for a free slot starts at texture_firstfree
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
// most recent error message set by the API functions (static string)
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
// pool offset published by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used throughout this file
363 DPSOFTRAST_State dpsoftrast;
// Scale (2^30) applied when converting a float depth to the integer depth
// buffer format, and a constant depth offset.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels into one 32-bit value laid out as
// alpha<<24 | red<<16 | green<<8 | blue; each channel is scaled by 255 and
// rounded by adding 0.5 before truncation.
// Every argument is parenthesized so expressions such as (x + y) convert
// correctly, and the shifts are done on unsigned values so that alpha == 255
// does not shift into the sign bit of an int.  The result is converted back
// to int to keep the macro's original type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth format: DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
// Compute the two framebuffer row ranges (miny1..maxy1 and miny2..maxy2)
// assigned to this thread from its index, fb_height and numthreads.
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is cut into 2*numthreads slices; range 1 is slice
// `index`, range 2 is slice `numthreads + index`
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// both ranges set to the same slice of fb_height/numthreads rows
// NOTE(review): presumably the non-interlaced branch — confirm
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recompute the framebuffer-space scissor rectangle of a thread, then refresh
// its viewport transform, clip plane and row ranges.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y measured from fb_height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the upper edges to the framebuffer size
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as {x, y, width, height}
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so it is recalculated after it
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Map the (sfactor, dfactor) blend function pair, plus the blendsubtract
// flag, to one DPSOFTRAST_BLENDMODE value stored in thread->fb_blendmode.
// The pair is packed as (sfactor<<16)|dfactor so one switch can match both;
// unrecognized pairs fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only SRC_ALPHA/ONE is recognized
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one `case` label that sets fb_blendmode and breaks
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch over the same packed pair
// NOTE(review): presumably the non-subtractive branch — confirm
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs map to MUL
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Call DPSOFTRAST_Validate(thread, f) only when at least one of the bits in f
// is set in thread->validate; the expression always evaluates to 0.
// Both arguments are parenthesized so any pointer expression can be passed
// as thread.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recompute the derived state selected by `mask` (DPSOFTRAST_VALIDATE_* bits)
// for those bits that are currently set in thread->validate, clearing each
// bit before its recalculation runs.
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only bits that are both requested and dirty are processed
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture slot by index. A slot is returned only when the index is
// at least 1, below dpsoftrast.texture_end, and the slot has pixel storage
// (bytes != NULL).
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarge the texture table and rebase every bound-texture pointer (global
// and per thread) from the old table address to the new one.
// NOTE(review): the realloc result overwrites dpsoftrast.texture directly and
// is not checked; on failure the old table would be lost and the rebasing
// below would work from a NULL base.
// NOTE(review): the rebasing subtracts `oldtexture`, a pointer value that may
// already have been released by realloc.
502 static void DPSOFTRAST_Texture_Grow(void)
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
// minimum capacity is 1024 slots
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
// capacity is doubled
// NOTE(review): presumably the else branch of the test above — confirm
513 dpsoftrast.texture_max *= 2;
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// keep each binding pointing at the same slot index in the new table
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture: validate flags and dimensions, claim a free slot in the
// texture table (growing it when needed), record the layout of every mip
// level and allocate zero-filled pixel storage.
// On a validation failure dpsoftrast.errorstring is set to a message.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a texture format
//   width, height, depth - dimensions; each must be a power of two and at
//                          most DPSOFTRAST_TEXTURE_MAXSIZE
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the MAXSIZE check below, so
// very large arguments could overflow int here.
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations rejected for 3D textures (depth != 1) and cubemaps
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the previous test verbatim.
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero unless x is a power of two
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot taken here; grow the table and
// extend the used range when the slot lies beyond them
602 dpsoftrast.texture_firstfree = texnum + 1;
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per mip level: s is the level's byte size (4 bytes per texel, all sides);
// the level records its byte offset (size so far), byte size and dimensions
622 s = w * h * d * sides * 4;
623 texture->mipmap[mipmaps][0] = size;
624 texture->mipmap[mipmaps][1] = s;
625 texture->mipmap[mipmaps][2] = w;
626 texture->mipmap[mipmaps][3] = h;
627 texture->mipmap[mipmaps][4] = d;
// test on a single-texel level or a texture without the MIPMAP flag
630 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
636 texture->mipmaps = mipmaps;
637 texture->size = size;
639 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here; a NULL
// bytes pointer makes the slot look unused to DPSOFTRAST_Texture_GetByIndex.
640 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release a texture: free its pixel storage, clear the slot, and update the
// free-slot hint and the used range of the texture table.
// Does nothing when index does not refer to a live texture.
644 void DPSOFTRAST_Texture_Free(int index)
646 DPSOFTRAST_Texture *texture;
647 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651 MM_FREE(texture->bytes);
652 texture->bytes = NULL;
// zeroing the whole slot also leaves bytes == NULL, which marks it unused
653 memset(texture, 0, sizeof(*texture));
654 // adjust the free range and used range
655 if (dpsoftrast.texture_firstfree > index)
656 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing unused slots
657 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658 dpsoftrast.texture_end--;
// Rebuild mip levels 1..mipmaps-1 of a texture, each from the level before
// it, by averaging neighbouring texels. Texels are 4 bytes wide and every
// byte channel is averaged independently with rounding.
// Does nothing for an invalid index.
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
662 int i, x, y, z, w, layer0, layer1, row0, row1;
663 unsigned char *o, *i0, *i1, *i2, *i3;
664 DPSOFTRAST_Texture *texture;
665 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
666 if (texture->mipmaps <= 1)
668 for (i = 1;i < texture->mipmaps;i++)
670 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
674 if (layer1 >= texture->mipmap[i-1][4])
675 layer1 = texture->mipmap[i-1][4]-1;
676 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
680 if (row1 >= texture->mipmap[i-1][3])
681 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
682 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
683 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687 w = texture->mipmap[i][2];
690 if (texture->mipmap[i-1][2] > 1)
692 // average 3D texture
// eight parent texels per output texel: sum + 4, then >> 3
693 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
695 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
703 // average 3D mipmap with parent width == 1
// four parent texels per output texel: sum + 2, then >> 2
704 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
706 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
715 if (texture->mipmap[i-1][2] > 1)
717 // average 2D texture (common case)
718 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
720 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
728 // 2D texture with parent width == 1
// two parent texels per output texel: sum + 1, then >> 1
729 o[0] = (i0[0] + i1[0] + 1) >> 1;
730 o[1] = (i0[1] + i1[1] + 1) >> 1;
731 o[2] = (i0[2] + i1[2] + 1) >> 1;
732 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a rectangular block of 4-byte pixels into mip level 0 of a texture
// and then regenerate its mipmaps. Does nothing for an invalid index.
//   pixels - blockwidth*blockheight pixels, read row after row
//   blockx, blocky, blockwidth, blockheight - destination rectangle
// NOTE(review): the visible statements only address mip level 0; how `mip`
// is used cannot be told from here.
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
741 DPSOFTRAST_Texture *texture;
743 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// start at the end of level 0 (its byte size), moved back by blocky rows and
// forward by blockx pixels
748 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749 while (blockheight > 0)
// step one texture row backwards before each copy, so successive source rows
// land at decreasing addresses (rows are stored in reverse vertical order)
751 dst -= texture->mipmap[0][2] * 4;
752 memcpy(dst, pixels, blockwidth * 4);
753 pixels += blockwidth * 4;
757 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the contents of mip level 0 from `pixels`, copying one full row
// (width*4 bytes) per iteration, then regenerate the mipmaps.
// Does nothing for an invalid index.
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
761 DPSOFTRAST_Texture *texture;
762 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// stride: bytes per row of level 0; dst starts at the end of level 0
767 int i, stride = texture->mipmap[0][2]*4;
768 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
// one iteration per row of level 0
769 for (i = texture->mipmap[0][3];i > 0;i--)
772 memcpy(dst, pixels, stride);
776 DPSOFTRAST_Texture_CalculateMipmaps(index);
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
780 DPSOFTRAST_Texture *texture;
781 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782 return texture->mipmap[mip][2];
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
786 DPSOFTRAST_Texture *texture;
787 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788 return texture->mipmap[mip][3];
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
792 DPSOFTRAST_Texture *texture;
793 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794 return texture->mipmap[mip][4];
// Return a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 for an invalid index.
// NOTE(review): `mip` is used as an array index without a range check.
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
798 DPSOFTRAST_Texture *texture;
799 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within bytes
802 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture. Does nothing for an invalid index.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created
// without DPSOFTRAST_TEXTURE_FLAG_MIPMAP set dpsoftrast.errorstring.
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
806 DPSOFTRAST_Texture *texture;
807 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
810 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
815 texture->filter = filter;
818 static void DPSOFTRAST_Draw_FlushThreads(void);
820 static void DPSOFTRAST_Draw_SyncCommands(void)
822 if(dpsoftrast.usethreads) MEMORY_BARRIER;
823 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make `space` bytes available in the command pool by recomputing how
// much of it is still in use by the threads and, when that is still too
// much, waiting on one thread.
//   space - number of bytes the caller wants to be free
826 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
828 DPSOFTRAST_State_Thread *thread;
830 int freecommand = dpsoftrast.commandpool.freecommand;
831 int usedcommands = dpsoftrast.commandpool.usedcommands;
// test whether the pool already has room for `space` bytes
832 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
834 DPSOFTRAST_Draw_SyncCommands();
// for each thread: distance from its read offset to the write position,
// wrapped into the pool size
840 for (i = 0; i < dpsoftrast.numthreads; i++)
842 thread = &dpsoftrast.threads[i];
843 commandoffset = freecommand - thread->commandoffset;
844 if (commandoffset < 0)
845 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
846 if (commandoffset > usedcommands)
849 usedcommands = commandoffset;
// test whether enough room is available now, or there is no thread to wait on
852 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread, under its draw mutex, unless it has already
// reached the published draw position
854 thread = &dpsoftrast.threads[waitindex];
855 Thread_LockMutex(thread->drawmutex);
856 if (thread->commandoffset != dpsoftrast.drawcommand)
858 thread->waiting = true;
// signal the thread's draw condition if it reports starving, then block on
// its wait condition
859 if (thread->starving) Thread_CondSignal(thread->drawcond);
860 Thread_CondWait(thread->waitcond, thread->drawmutex);
861 thread->waiting = false;
863 Thread_UnlockMutex(thread->drawmutex);
865 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round `size` up to the next multiple of COMMAND_SIZE (the bit masking
// requires COMMAND_SIZE to be a power of two).
868 #define DPSOFTRAST_ALIGNCOMMAND(size) \
869 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocate a command record of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and the aligned size of that type.
870 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
871 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a new command and fill in its
// header (opcode and commandsize).
//   opcode - DPSOFTRAST_OPCODE_* value stored in the header
//   size   - record size in bytes
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
875 DPSOFTRAST_Command *command;
876 int freecommand = dpsoftrast.commandpool.freecommand;
877 int usedcommands = dpsoftrast.commandpool.usedcommands;
// besides the record itself, room for one command header is kept, plus the
// unusable tail of the pool when the record does not fit before the pool end
878 int extra = sizeof(DPSOFTRAST_Command);
879 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim some (threaded) or flush, then re-read the
// pool positions
881 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
883 if (dpsoftrast.usethreads)
884 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
886 DPSOFTRAST_Draw_FlushThreads();
887 freecommand = dpsoftrast.commandpool.freecommand;
888 usedcommands = dpsoftrast.commandpool.usedcommands;
// record does not fit before the pool end: write a Reset opcode at the
// current position and count the remaining tail as used
890 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
892 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893 command->opcode = DPSOFTRAST_OPCODE_Reset;
894 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// fill in the header of the new record
897 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898 command->opcode = opcode;
899 command->commandsize = size;
901 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// store the updated write position and usage
903 dpsoftrast.commandpool.freecommand = freecommand;
904 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Take back `size` bytes of pool usage and store the adjusted pool positions.
// NOTE(review): presumably this reverts the most recent allocation — confirm.
908 static void DPSOFTRAST_UndoCommand(int size)
910 int freecommand = dpsoftrast.commandpool.freecommand;
911 int usedcommands = dpsoftrast.commandpool.usedcommands;
// NOTE(review): looks like a wrap-around correction of the write position;
// its guarding condition is not shown here — confirm.
914 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915 usedcommands -= size;
916 dpsoftrast.commandpool.freecommand = freecommand;
917 dpsoftrast.commandpool.usedcommands = usedcommands;
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
923 thread->viewport[0] = command->x;
924 thread->viewport[1] = command->y;
925 thread->viewport[2] = command->width;
926 thread->viewport[3] = command->height;
927 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queue a Viewport command and update the global copy of the
// viewport, including its derived center/scale values.
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
931 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
934 command->width = width;
935 command->height = height;
// global-state copy of the viewport, kept alongside the queued command
937 dpsoftrast.viewport[0] = x;
938 dpsoftrast.viewport[1] = y;
939 dpsoftrast.viewport[2] = width;
940 dpsoftrast.viewport[3] = height;
941 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear color as four float channels.
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Apply a queued ClearColor command: fill the part of each bound color buffer
// that lies inside both the thread's scissor rectangle and its row ranges
// with the packed clear color.
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
947 int i, x1, y1, x2, y2, w, h, x, y;
948 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges must be current before they are read
952 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953 miny1 = thread->miny1;
954 maxy1 = thread->maxy1;
955 miny2 = thread->miny2;
956 maxy2 = thread->maxy2;
// scissor rectangle as x1..x2 / y1..y2
957 x1 = thread->fb_scissor[0];
958 y1 = thread->fb_scissor[1];
959 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// limit the rows to the span covered by the thread's two ranges
961 if (y1 < miny1) y1 = miny1;
962 if (y2 > maxy2) y2 = maxy2;
967 // FIXME: honor fb_colormask?
968 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// loop over the four color buffer slots
969 for (i = 0;i < 4;i++)
971 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to maxy1 first, then continue from miny2 up to maxy2
973 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// p: start of row y in color buffer i
976 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977 for (x = x1;x < x2;x++)
// Public API: queue a ClearColor command for the given color.
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
984 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear depth as a float.
991 DEFCOMMAND(3, ClearDepth, float depth;)
// Apply a queued ClearDepth command: fill the part of the depth buffer that
// lies inside both the thread's scissor rectangle and its row ranges with
// the converted depth value.
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
994 int x1, y1, x2, y2, w, h, x, y;
995 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges must be current before they are read
999 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000 miny1 = thread->miny1;
1001 maxy1 = thread->maxy1;
1002 miny2 = thread->miny2;
1003 maxy2 = thread->maxy2;
// scissor rectangle as x1..x2 / y1..y2
1004 x1 = thread->fb_scissor[0];
1005 y1 = thread->fb_scissor[1];
1006 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// limit the rows to the span covered by the thread's two ranges
1008 if (y1 < miny1) y1 = miny1;
1009 if (y2 > maxy2) y2 = maxy2;
// float depth converted to the integer depth buffer format
1014 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to maxy1 first, then continue from miny2 up to maxy2
1015 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016 for (;y < bandy;y++)
// p: start of row y in the depth buffer
1018 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019 for (x = x1;x < x2;x++)
// Public API: queue a ClearDepth command for depth value d.
1023 void DPSOFTRAST_ClearDepth(float d)
1025 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1032 thread->colormask[0] = command->r != 0;
1033 thread->colormask[1] = command->g != 0;
1034 thread->colormask[2] = command->b != 0;
1035 thread->colormask[3] = command->a != 0;
1036 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public API: queue a ColorMask command with the four channel enables.
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1040 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1047 DEFCOMMAND(5, DepthTest, int enable;)
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1050 thread->depthtest = command->enable;
1051 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1053 void DPSOFTRAST_DepthTest(int enable)
1055 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056 command->enable = enable;
// Declares the ScissorTest command (first macro argument 6) with one integer
// payload field, enable.
1059 DEFCOMMAND(6, ScissorTest, int enable;)
// Applies a ScissorTest command to one thread's state: stores the flag and ORs
// DPSOFTRAST_VALIDATE_FB into thread->validate.
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1062 thread->scissortest = command->enable;
1063 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point for the ScissorTest command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores enable in it.
1065 void DPSOFTRAST_ScissorTest(int enable)
1067 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068 command->enable = enable;
// Declares the Scissor command (first macro argument 7) with four float
// payload fields: x, y, width, height.
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Applies a Scissor command to one thread's state: copies the rectangle into
// thread->scissor[0..3] as x, y, width, height and ORs DPSOFTRAST_VALIDATE_FB
// into thread->validate.
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1074 thread->scissor[0] = command->x;
1075 thread->scissor[1] = command->y;
1076 thread->scissor[2] = command->width;
1077 thread->scissor[3] = command->height;
1078 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point for the Scissor command: obtains a command record through
// DPSOFTRAST_ALLOCATECOMMAND and stores width and height in it.
// NOTE(review): the stores of x and y are not visible in this excerpt --
// confirm against the full file.
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1082 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1085 command->width = width;
1086 command->height = height;
// Declares the BlendFunc command (first macro argument 8) with two integer
// payload fields: sfactor and dfactor.
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command to one thread's state: blendfunc[0] receives the
// source factor, blendfunc[1] the destination factor, and
// DPSOFTRAST_VALIDATE_BLENDFUNC is ORed into thread->validate.
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1092 thread->blendfunc[0] = command->sfactor;
1093 thread->blendfunc[1] = command->dfactor;
1094 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point for the BlendFunc command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores both blend factors in it.
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1098 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099 command->sfactor = sfactor;
1100 command->dfactor = dfactor;
// Declares the BlendSubtract command (first macro argument 9) with one integer
// payload field, enable.
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to one thread's state: stores the flag and
// ORs DPSOFTRAST_VALIDATE_BLENDFUNC into thread->validate.
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1106 thread->blendsubtract = command->enable;
1107 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point for the BlendSubtract command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores enable in it.
1109 void DPSOFTRAST_BlendSubtract(int enable)
1111 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112 command->enable = enable;
// Declares the DepthMask command (first macro argument 10) with one integer
// payload field, enable.
1115 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command to one thread's state by storing the flag in
// thread->depthmask. No validate bit is set in the visible lines.
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1118 thread->depthmask = command->enable;
// Public entry point for the DepthMask command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores enable in it.
1120 void DPSOFTRAST_DepthMask(int enable)
1122 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123 command->enable = enable;
// Declares the DepthFunc command (first macro argument 11) with one integer
// payload field, func.
1126 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command to one thread's state by storing the comparison
// selector in thread->depthfunc. No validate bit is set in the visible lines.
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1129 thread->depthfunc = command->func;
// Public entry point for the DepthFunc command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores func in it.
1131 void DPSOFTRAST_DepthFunc(int func)
1133 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134 command->func = func;
// Declares the DepthRange command (first macro argument 12) with two float
// payload fields: nearval and farval.
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command to one thread's state: depthrange[0] receives
// the near value and depthrange[1] the far value.
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1140 thread->depthrange[0] = command->nearval;
1141 thread->depthrange[1] = command->farval;
// Public entry point for the DepthRange command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores both range values in it.
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1145 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146 command->nearval = nearval;
1147 command->farval = farval;
// Declares the PolygonOffset command (first macro argument 13) with two float
// payload fields: alongnormal and intoview.
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command to one thread's state: polygonoffset[0]
// receives alongnormal and polygonoffset[1] receives intoview.
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1153 thread->polygonoffset[0] = command->alongnormal;
1154 thread->polygonoffset[1] = command->intoview;
// Public entry point for the PolygonOffset command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores both offsets in it.
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1158 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159 command->alongnormal = alongnormal;
1160 command->intoview = intoview;
// Declares the CullFace command (first macro argument 14) with one integer
// payload field, mode.
1163 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command to one thread's state by storing the mode in
// thread->cullface.
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1166 thread->cullface = command->mode;
// Public entry point for the CullFace command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores mode in it.
1168 void DPSOFTRAST_CullFace(int mode)
1170 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171 command->mode = mode;
// Sets the current constant color in the global dpsoftrast state as
// color[0..3] = r, g, b, a. Unlike the command wrappers around it, the visible
// lines write the global directly and allocate no command.
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1176 dpsoftrast.color[0] = r;
1177 dpsoftrast.color[1] = g;
1178 dpsoftrast.color[2] = b;
1179 dpsoftrast.color[3] = a;
// Reads a rectangle of the first color buffer into outpixels.
//   blockx, blocky            - origin of the requested rectangle
//   blockwidth, blockheight   - size of the requested rectangle
//   outpixels                 - destination, addressed with a row pitch of
//                               blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer and rows are read from the
// opposite vertical end of the framebuffer (row fb_height - 1 - y), i.e. the
// image is flipped vertically on the way out.
// NOTE(review): the declarations of bx1/by1/x/y/b/o and both per-pixel copy
// loops are not visible in this excerpt.
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Row pitches in bytes (4 bytes per pixel on both sides).
1184 int outstride = blockwidth * 4;
1185 int instride = dpsoftrast.fb_width * 4;
1188 int bx2 = blockx + blockwidth;
1189 int by2 = blocky + blockheight;
1193 unsigned char *inpixels;
// Clamp the requested rectangle to the framebuffer bounds.
1197 if (bx1 < 0) bx1 = 0;
1198 if (by1 < 0) by1 = 0;
1199 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1202 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Two copy paths selected by host byte order; both address rows identically.
1203 if (dpsoftrast.bigendian)
1205 for (y = by1;y < by2;y++)
// Source row is mirrored vertically; destination rows are numbered from the
// clamped top edge.
// NOTE(review): the destination row starts at column 0 even when the left edge
// was clamped (bx1 > blockx) -- confirm this is intended.
1207 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208 o = (unsigned char *)outpixels + (y - by1) * outstride;
1209 for (x = bx1;x < bx2;x++)
1222 for (y = by1;y < by2;y++)
1224 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the first color buffer into one mip level of a texture.
//   index, mip     - texture handle and mip level to write
//   tx, ty         - destination origin inside the mip level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - requested rectangle size
// Returns silently when the handle does not resolve or mip is out of range.
// Both rectangles are clamped to their surfaces, the copy size is limited to
// what fits in both, and the texture's mip chain is regenerated afterwards
// when it has more than one level.
// NOTE(review): the declarations and initial values of tx1/ty1/sx1/sy1 and
// tw/th/sw/sh, and the statement guarded by the "tw < 1 || th < 1" test, are
// not visible in this excerpt.
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 int tx2 = tx + width;
1236 int ty2 = ty + height;
1239 int sx2 = sx + width;
1240 int sy2 = sy + height;
1250 unsigned int *spixels;
1251 unsigned int *tpixels;
1252 DPSOFTRAST_Texture *texture;
1253 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254 if (mip < 0 || mip >= texture->mipmaps) return;
// Source surface: first color buffer of the framebuffer.
1256 spixels = dpsoftrast.fb_colorpixels[0];
1257 swidth = dpsoftrast.fb_width;
1258 sheight = dpsoftrast.fb_height;
// Destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the level width and [3] as the level height.
1259 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260 twidth = texture->mipmap[mip][2];
1261 theight = texture->mipmap[mip][3];
// Clamp destination and source rectangles to their surfaces.
1262 if (tx1 < 0) tx1 = 0;
1263 if (ty1 < 0) ty1 = 0;
1264 if (tx2 > twidth) tx2 = twidth;
1265 if (ty2 > theight) ty2 = theight;
1266 if (sx1 < 0) sx1 = 0;
1267 if (sy1 < 0) sy1 = 0;
1268 if (sx2 > swidth) sx2 = swidth;
1269 if (sy2 > sheight) sy2 = sheight;
// Never copy more than the source rectangle provides.
1274 if (tw > sw) tw = sw;
1275 if (th > sh) th = sh;
1276 if (tw < 1 || th < 1)
// Re-express both start rows as measured from the opposite vertical edge of
// their surface before the row-by-row copy.
1278 sy1 = sheight - sy1 - th;
1279 ty1 = theight - ty1 - th;
// Each row copies tw pixels; tw*4 is the byte count (4 bytes per pixel).
1280 for (y = 0;y < th;y++)
1281 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1282 if (texture->mipmaps > 1)
1283 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Declares the SetTexture command (first macro argument 17) carrying the unit
// number and the texture pointer to bind.
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: if a texture is already
// bound on that unit its binds counter is atomically decremented, then the
// unit is pointed at the new texture (which may be NULL as far as this code
// is concerned).
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1289 if (thread->texbound[command->unitnum])
1290 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291 thread->texbound[command->unitnum] = command->texture;
// Public entry point that binds texture handle `index` to texture unit
// `unitnum`.
// Sets dpsoftrast.errorstring when unitnum is outside
// [0, DPSOFTRAST_MAXTEXTUREUNITS) or when a non-zero index does not resolve to
// a texture. Otherwise it records the binding in a SetTexture command and in
// the global dpsoftrast.texbound table.
// NOTE(review): the early returns after the two error assignments, and any
// guard around the final ATOMIC_ADD for a NULL texture (index 0), are not
// visible in this excerpt -- confirm against the full file.
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1295 DPSOFTRAST_Command_SetTexture *command;
1296 DPSOFTRAST_Texture *texture;
1297 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1299 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302 texture = DPSOFTRAST_Texture_GetByIndex(index);
// Index 0 with no texture is accepted (the test only fires for non-zero index).
1303 if (index && !texture)
1305 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310 command->unitnum = unitnum;
1311 command->texture = texture;
1313 dpsoftrast.texbound[unitnum] = texture;
// The bind count grows by the number of threads; presumably one reference per
// worker, matching the per-thread decrement in DPSOFTRAST_Interpret_SetTexture.
1315 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the caller's position array and its stride in the global state.
// Only the pointer is stored; no vertex data is copied here.
// NOTE(review): stride is presumably in bytes (DPSOFTRAST_Array_Load adds
// firstvertex * stride to a byte pointer) -- confirm.
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1320 dpsoftrast.pointer_vertex3f = vertex3f;
1321 dpsoftrast.stride_vertex = stride;
// Selects a float color array as the color source: stores the pointer and
// stride and clears the unsigned-byte color pointer so that only one of the
// two color sources is set at a time.
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1325 dpsoftrast.pointer_color4f = color4f;
1326 dpsoftrast.pointer_color4ub = NULL;
1327 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array as the color source: stores the pointer
// and stride and clears the float color pointer so that only one of the two
// color sources is set at a time.
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1331 dpsoftrast.pointer_color4f = NULL;
1332 dpsoftrast.pointer_color4ub = color4ub;
1333 dpsoftrast.stride_color = stride;
// Records the texture-coordinate array for one unit: pointer, number of float
// components per vertex, and stride.
// NOTE(review): unitnum indexes the three global arrays directly and no range
// check is visible here; callers must pass a valid unit.
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1337 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339 dpsoftrast.stride_texcoord[unitnum] = stride;
// Declares the SetShader command (first macro argument 18) with three integer
// payload fields: mode, permutation and exactspecularmath.
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command to one thread's state by copying the three
// shader selection fields into the thread.
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1345 thread->shader_mode = command->mode;
1346 thread->shader_permutation = command->permutation;
1347 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point for shader selection: fills a SetShader command and
// mirrors the same three values into the global dpsoftrast state.
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352 command->mode = mode;
1353 command->permutation = permutation;
1354 command->exactspecularmath = exactspecularmath;
// Keep the global copy in step with what was recorded in the command.
1356 dpsoftrast.shader_mode = mode;
1357 dpsoftrast.shader_permutation = permutation;
1358 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Declares the Uniform4f command (first macro argument 19): a uniform index
// plus four floats.
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command to one thread's state: copies the four floats
// into the thread's uniform4f array at element offset index*4.
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1364 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point that sets one 4-float uniform from scalar arguments:
// fills a Uniform4f command and mirrors the values into the global uniform4f
// array at element offset index*4.
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369 command->index = index;
1370 command->val[0] = v0;
1371 command->val[1] = v1;
1372 command->val[2] = v2;
1373 command->val[3] = v3;
// Global mirror of the same four values.
1375 dpsoftrast.uniform4f[index*4+0] = v0;
1376 dpsoftrast.uniform4f[index*4+1] = v1;
1377 dpsoftrast.uniform4f[index*4+2] = v2;
1378 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point at four readable floats.
// They are copied into a Uniform4f command and into the global uniform4f array
// at element offset index*4.
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383 command->index = index;
1384 memcpy(command->val, v, sizeof(command->val));
1386 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Declares the UniformMatrix4f command (first macro argument 20): a uniform
// index plus sixteen floats declared through ALIGN so the payload can be
// written with aligned SSE stores.
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command to one thread's state: copies the sixteen
// floats into the thread's uniform4f array starting at element index*4, so a
// matrix occupies four consecutive 4-float uniform slots.
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1392 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point that uploads `arraysize` consecutive 4x4 float matrices
// starting at uniform slot `uniform`.
//   v          - arraysize * 16 floats; may be unaligned
//   transpose  - NOTE(review): the test that guards the transpose sequence
//                below is not visible in this excerpt; presumably it is this
//                flag -- confirm.
// Each matrix advances the slot index by 4 and the source pointer by 16 floats.
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1398 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1400 __m128 m0, m1, m2, m3;
1401 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402 command->index = (DPSOFTRAST_UNIFORM)index;
// Pick unaligned or aligned loads depending on the source address.
1403 if (((size_t)v)&(ALIGN_SIZE-1))
1405 m0 = _mm_loadu_ps(v);
1406 m1 = _mm_loadu_ps(v+4);
1407 m2 = _mm_loadu_ps(v+8);
1408 m3 = _mm_loadu_ps(v+12);
1412 m0 = _mm_load_ps(v);
1413 m1 = _mm_load_ps(v+4);
1414 m2 = _mm_load_ps(v+8);
1415 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves.
1419 __m128 t0, t1, t2, t3;
1420 t0 = _mm_unpacklo_ps(m0, m1);
1421 t1 = _mm_unpacklo_ps(m2, m3);
1422 t2 = _mm_unpackhi_ps(m0, m1);
1423 t3 = _mm_unpackhi_ps(m2, m3);
1424 m0 = _mm_movelh_ps(t0, t1);
1425 m1 = _mm_movehl_ps(t1, t0);
1426 m2 = _mm_movelh_ps(t2, t3);
1427 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores into the command payload (declared with ALIGN) ...
1429 _mm_store_ps(command->val, m0);
1430 _mm_store_ps(command->val+4, m1);
1431 _mm_store_ps(command->val+8, m2);
1432 _mm_store_ps(command->val+12, m3);
// ... and into the global mirror. _mm_store_ps needs 16-byte aligned
// addresses; assumes dpsoftrast.uniform4f is 16-byte aligned -- TODO confirm.
1433 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Declares the Uniform1i command (first macro argument 21): a uniform index
// plus one integer value.
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command to one thread's state by storing the value in
// the thread's uniform1i array at `index`.
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1444 thread->uniform1i[command->index] = command->val;
// Public entry point that sets one integer uniform: records the index in a
// Uniform1i command and mirrors i0 into the global uniform1i array.
// NOTE(review): the store of i0 into command->val is not visible in this
// excerpt -- confirm against the full file.
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449 command->index = index;
1452 dpsoftrast.uniform1i[command->index] = i0;
// Declares the ClipPlane command (first macro argument 24) carrying four
// floats.
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command to one thread's state: copies the four floats
// into thread->clipplane and ORs DPSOFTRAST_VALIDATE_FB into thread->validate.
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point for the ClipPlane command: obtains a command record
// through DPSOFTRAST_ALLOCATECOMMAND and stores x, y, z, w as
// clipplane[0..3].
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464 command->clipplane[0] = x;
1465 command->clipplane[1] = y;
1466 command->clipplane[2] = z;
1467 command->clipplane[3] = w;
// Copies 4-float vertices from a strided byte source into a packed 4-float
// destination.
//   dst    - destination; written with _mm_store_ps, so it must be 16-byte
//            aligned
//   src    - first source vertex
//   size   - number of vertices (end = dst + size*4 floats)
//   stride - distance between source vertices, presumably in bytes (it is
//            OR-ed with the byte address for the alignment test)
// The unaligned-load path is taken when either the source address or the
// stride is not a multiple of ALIGN_SIZE; otherwise aligned loads are used.
// NOTE(review): both loop headers and the pointer increments are not visible
// in this excerpt.
1471 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1473 float *end = dst + size*4;
1474 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1478 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1487 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float vertices (x, y, z) from a strided byte source into packed
// 4-float vertices (x, y, z, 1.0f).
//   dst    - destination; written with _mm_store_ps, so it must be 16-byte
//            aligned
//   src    - first source vertex
//   size   - number of vertices (end = dst + size*4 floats)
//   stride - distance between source vertices, presumably in bytes
// When the source is tightly packed (stride == sizeof(float[3])) four vertices
// (12 floats) are converted per step up to end4; the remaining vertices use
// the one-vertex-at-a-time code at the bottom. Each path has an unaligned and
// an aligned variant.
// NOTE(review): the loop headers and dst/src increments other than the two
// visible "src +=" lines are not shown in this excerpt. The one-vertex paths
// load 16 bytes from a 12-byte vertex, so they read 4 bytes beyond each vertex
// -- confirm the source buffers tolerate this for the final vertex.
1494 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1496 float *end = dst + size*4;
1497 if (stride == sizeof(float[3]))
// End of the portion that is a whole multiple of four vertices.
1499 float *end4 = dst + (size&~3)*4;
1500 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = (x0,y0,z0,x1), v2 = (y1,z1,x2,y2), v3 = (z2,x3,y3,z3).
// The recurring idiom: rotate so the unwanted lane is lane 0, overwrite lane 0
// with 1.0f via _mm_move_ss, rotate back so the 1.0f ends up in lane 3.
1504 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vertex 0 -> (x0,y0,z0,1)
1505 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1506 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 1 -> (x1,y1,z1,1), gathered from v1 lane 3 and v2 lanes 0-1
1508 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1509 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 2 -> (x2,y2,z2,1), gathered from v2 lanes 2-3 and v3 lane 0
1511 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1512 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1513 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1514 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vertex 3 -> (x3,y3,z3,1): v3 already has x3,y3,z3 in lanes 1-3
1515 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1516 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518 src += 4*sizeof(float[3]);
// Same four-vertex conversion using aligned loads.
1525 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1526 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1527 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1530 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1533 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1534 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1535 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1537 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539 src += 4*sizeof(float[3]);
// One-vertex-at-a-time code: unaligned loads when the address or the stride is
// not a multiple of ALIGN_SIZE, aligned loads otherwise.
1543 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// (x,y,z,?) -> (?,x,y,z) -> (1,x,y,z) -> (x,y,z,1)
1547 __m128 v = _mm_loadu_ps((const float *)src);
1548 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1549 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1550 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1551 _mm_store_ps(dst, v);
1560 __m128 v = _mm_load_ps((const float *)src);
1561 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1562 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1563 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1564 _mm_store_ps(dst, v);
// Expands 2-float vertices (s, t) from a strided byte source into packed
// 4-float vertices (s, t, 0.0f, 1.0f).
//   dst    - destination; written with _mm_store_ps, so it must be 16-byte
//            aligned
//   src    - first source vertex
//   size   - number of vertices (end = dst + size*4 floats)
//   stride - distance between source vertices, presumably in bytes
// When the source is tightly packed (stride == sizeof(float[2])) two vertices
// are converted per 16-byte load up to end2, with an unaligned and an aligned
// variant; remaining vertices use the single-vertex store at the bottom.
// NOTE(review): the loop headers and most pointer increments are not visible
// in this excerpt.
1571 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1573 float *end = dst + size*4;
// Constant supplying the upper two lanes (0, 1) of every output vertex.
1574 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1575 if (stride == sizeof(float[2]))
// End of the portion that is a whole multiple of two vertices.
1577 float *end2 = dst + (size&~1)*4;
1578 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = (s0,t0,s1,t1): the shuffle yields (s0,t0,0,1), movehl yields (s1,t1,0,1).
1582 __m128 v = _mm_loadu_ps((const float *)src);
1583 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1584 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1586 src += 2*sizeof(float[2]);
1593 __m128 v = _mm_load_ps((const float *)src);
1594 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1595 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1597 src += 2*sizeof(float[2]);
// Single vertex: load 8 bytes into the low lanes, keep (0,1) from v2 on top.
1603 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte vertices (four unsigned bytes each) from a strided byte
// source into packed 4-float vertices scaled by 1/255, so byte 255 maps to
// 1.0f.
//   dst    - destination; written with _mm_store_ps, so it must be 16-byte
//            aligned
//   src    - first source vertex
//   size   - number of vertices (end = dst + size*4 floats)
//   stride - distance between source vertices, presumably in bytes
// When the source is tightly packed (stride == 4) four vertices are converted
// per 16-byte load up to end4, with an unaligned and an aligned variant;
// remaining vertices use the single-vertex code at the bottom.
// NOTE(review): the loop headers and most pointer increments are not visible
// in this excerpt. The single-vertex path reads src through an int pointer,
// which assumes that access is acceptable for the source address -- confirm.
1609 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1611 float *end = dst + size*4;
1612 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1613 if (stride == sizeof(unsigned char[4]))
// End of the portion that is a whole multiple of four vertices.
1615 float *end4 = dst + (size&~3)*4;
1616 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Widen 16 bytes to 16-bit (v1 = low 8 bytes, v2 = high 8 bytes), then each
// half to 32-bit, convert to float and scale.
1620 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1621 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1622 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1626 src += 4*sizeof(unsigned char[4]);
// Same conversion using an aligned 16-byte load.
1633 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1634 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1635 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1639 src += 4*sizeof(unsigned char[4]);
// Single vertex: load 4 bytes, widen 8 -> 16 -> 32 bit, convert and scale.
1645 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1646 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single 4-float value at src into dst.
//   dst  - destination; written with _mm_store_ps, so it must be 16-byte
//          aligned
//   src  - four floats, read once with an unaligned load
//   size - number of 4-float entries (end = dst + 4*size floats)
// NOTE(review): the loop that repeats the store up to `end` is not visible in
// this excerpt; presumably every entry receives the same value -- confirm.
1652 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1654 float *end = dst + 4*size;
1655 __m128 v = _mm_loadu_ps(src);
1658 _mm_store_ps(dst, v);
// Transforms numitems 4-float vertices by a 4x4 matrix.
//   out4f       - destination vertices
//   in4f        - source vertices; may be the same pointer as out4f
//   numitems    - number of vertices
//   inmatrix16f - sixteen floats, read as four 4-float vectors m0..m3 with
//                 unaligned loads
// Per vertex the result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// A matrix that is byte-for-byte equal to the identity skips the math and just
// copies (or does nothing when in and out are the same buffer).
// NOTE(review): the return after the identity case, the loop headers and the
// store call that wraps each sum are not visible in this excerpt.
1664 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1667 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1668 __m128 m0, m1, m2, m3;
1670 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1672 // fast case for identity matrix
1673 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1676 end = out4f + numitems*4;
1677 m0 = _mm_loadu_ps(inmatrix16f);
1678 m1 = _mm_loadu_ps(inmatrix16f + 4);
1679 m2 = _mm_loadu_ps(inmatrix16f + 8);
1680 m3 = _mm_loadu_ps(inmatrix16f + 12);
1681 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Unaligned-source variant: broadcast each component and accumulate.
1685 __m128 v = _mm_loadu_ps(in4f);
1687 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1690 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Aligned-source variant of the same computation.
1699 __m128 v = _mm_load_ps(in4f);
1701 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1704 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vertices from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1713 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1715 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one clip-space vertex.
//   in  = (x, y, z, w)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where c = viewportcenter and s = viewportscale. Internally the vertex is
// rotated to (1, x, y, z) before the multiply, so lane 0 of the two viewport
// vectors pairs with the constant 1 (and lands in the output's last lane) and
// lanes 1..3 pair with x, y, z.
// `in` is evaluated once; `out` may name the same variable as `in`.
1720 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1722 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Perspective-divides and viewport-maps one clip-space vertex. The lines
// visible here are identical to the body of DPSOFTRAST_PROJECTVERTEX:
//   in  = (x, y, z, w)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// with c = viewportcenter and s = viewportscale.
// NOTE(review): despite the name, nothing in the visible lines restricts the
// computation to Y -- confirm whether the duplication is intentional.
1728 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1730 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1731 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1732 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1733 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vertex by a matrix given as four vectors:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// NOTE(review): the line that introduces the local `p` (presumably a copy of
// `in`) is not visible in this excerpt.
1736 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1739 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1740 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1741 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1742 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space Y range for the axis-aligned box minposf..maxposf
// after transformation by inmatrix16f, taking the z + w >= 0 plane into
// account.
//   starty, endy     - outputs, written at the end
//   minposf, maxposf - box corners; read with _mm_load_ps, so they must be
//                      16-byte aligned
//   inmatrix16f      - sixteen floats, read with unaligned loads
// How it works:
// - Every matrix vector has its first two lanes swapped so the transformed Y
//   lands in lane 0, where the scalar (_ss) intrinsics operate.
// - BBFRONT(k, pos) transforms corner k into bb[k] and stores z + w in
//   clipdist[k]. Bits 0/1/2 of k select the max instead of the min coordinate
//   for x/y/z. A corner with z + w >= 0 clears its bit in clipmask and folds
//   its y/w into minproj/maxproj.
// - BBCLIP(k) handles a corner whose bit is still set: for each of the three
//   edge neighbours k^1, k^2, k^4 whose bit is clear, it interpolates along
//   the edge to the point where z + w == 0 and folds that point's y/w into
//   minproj/maxproj.
// - minproj is then raised to at least -2 and maxproj lowered to at most 2,
//   both are mapped through lane 2 of the viewport center/scale vectors (the
//   lane that pairs with y in DPSOFTRAST_PROJECTVERTEX), and truncated.
// NOTE(review): *starty comes from maxproj and *endy from minproj, which only
// yields starty <= endy if the Y viewport scale is negative -- confirm. The
// return statement, BBFRONT(0)/BBFRONT(7) and the "#define BBCLIP" line are
// not visible in this excerpt.
1745 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// All eight corner bits start set; BBFRONT clears the bit of every corner that
// passes the z + w >= 0 test.
1747 int clipmask = 0xFF;
1748 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start inverted (2 and -2) so the first folded value replaces
// them.
1749 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1750 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1751 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// Swap lanes 0 and 1 of every matrix vector: transformed vertices come out as
// (y, x, z, w).
1752 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1753 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1754 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1755 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1756 #define BBFRONT(k, pos) \
1758 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1759 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1760 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1763 clipmask &= ~(1<<k); \
1764 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1765 minproj = _mm_min_ss(minproj, proj); \
1766 maxproj = _mm_max_ss(maxproj, proj); \
1770 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1771 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1772 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1773 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1779 if (clipmask&(1<<k)) \
1781 if (!(clipmask&(1<<(k^1)))) \
1783 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1784 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1785 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786 minproj = _mm_min_ss(minproj, proj); \
1787 maxproj = _mm_max_ss(maxproj, proj); \
1789 if (!(clipmask&(1<<(k^2)))) \
1791 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1792 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1793 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794 minproj = _mm_min_ss(minproj, proj); \
1795 maxproj = _mm_max_ss(maxproj, proj); \
1797 if (!(clipmask&(1<<(k^4)))) \
1799 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1800 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1801 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1802 minproj = _mm_min_ss(minproj, proj); \
1803 maxproj = _mm_max_ss(maxproj, proj); \
1807 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Bring lane 2 of the viewport vectors into lane 0 for the scalar math below.
1808 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1809 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// Limit the projected range before mapping it to screen rows.
1810 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1811 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1812 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1813 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// Truncating conversions; endy gets +1.
1814 *starty = _mm_cvttss_si32(maxproj);
1815 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed vertices to screen space and computes
// their screen Y range.
//   out4f    - receives an unmodified copy of every input vertex (aligned
//              store)
//   screen4f - receives the projected vertex (aligned store)
//   starty, endy - outputs, forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f     - source vertices; unaligned sources take the _mm_loadu_ps path
// While iterating, the component-wise min and max of the inputs are tracked;
// that bounding box is then handed to DPSOFTRAST_Vertex_BoundY together with
// an identity matrix, and its result is returned.
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt. The initial load of in4f happens before any visible count
// check, so numitems == 0 would still read one vertex -- confirm callers never
// pass 0.
1819 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1821 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1822 float *end = out4f + numitems*4;
1823 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1824 __m128 minpos, maxpos;
1825 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex (unaligned variant).
1827 minpos = maxpos = _mm_loadu_ps(in4f);
1830 __m128 v = _mm_loadu_ps(in4f);
1831 minpos = _mm_min_ps(minpos, v);
1832 maxpos = _mm_max_ps(maxpos, v);
1833 _mm_store_ps(out4f, v);
1834 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1835 _mm_store_ps(screen4f, v);
// Seed the bounds with the first vertex (aligned variant).
1843 minpos = maxpos = _mm_load_ps(in4f);
1846 __m128 v = _mm_load_ps(in4f);
1847 minpos = _mm_min_ps(minpos, v);
1848 maxpos = _mm_max_ps(maxpos, v);
1849 _mm_store_ps(out4f, v);
1850 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1851 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned stack arrays, as BoundY reads them with aligned
// loads.
1859 ALIGN(float minposf[4]);
1860 ALIGN(float maxposf[4]);
1861 _mm_store_ps(minposf, minpos);
1862 _mm_store_ps(maxposf, maxpos);
1863 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by inmatrix16f, projects them to screen space
// and computes their screen Y range.
//   out4f    - receives the transformed (clip-space) vertex (aligned store)
//   screen4f - receives the projected vertex (aligned store)
//   starty, endy - outputs, forwarded to DPSOFTRAST_Vertex_BoundY
//   in4f     - source vertices; unaligned sources take the _mm_loadu_ps path
//   inmatrix16f - sixteen floats, read with unaligned loads
// A matrix byte-for-byte equal to the identity delegates to
// DPSOFTRAST_Vertex_Project. Otherwise the min/max of the UNtransformed inputs
// are tracked, and that box is handed to DPSOFTRAST_Vertex_BoundY together
// with the same matrix; its result is returned.
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt. The initial load of in4f happens before any visible count
// check -- confirm callers never pass numitems == 0.
1868 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1870 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1871 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1873 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1874 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1875 end = out4f + numitems*4;
1876 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1877 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1878 m0 = _mm_loadu_ps(inmatrix16f);
1879 m1 = _mm_loadu_ps(inmatrix16f + 4);
1880 m2 = _mm_loadu_ps(inmatrix16f + 8);
1881 m3 = _mm_loadu_ps(inmatrix16f + 12);
1882 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex (unaligned variant).
1884 minpos = maxpos = _mm_loadu_ps(in4f);
// Bounds are taken before the transform; out4f gets the transformed vertex and
// screen4f the projected one.
1887 __m128 v = _mm_loadu_ps(in4f);
1888 minpos = _mm_min_ps(minpos, v);
1889 maxpos = _mm_max_ps(maxpos, v);
1890 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1891 _mm_store_ps(out4f, v);
1892 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1893 _mm_store_ps(screen4f, v);
// Seed the bounds with the first vertex (aligned variant).
1901 minpos = maxpos = _mm_load_ps(in4f);
1904 __m128 v = _mm_load_ps(in4f);
1905 minpos = _mm_min_ps(minpos, v);
1906 maxpos = _mm_max_ps(maxpos, v);
1907 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1908 _mm_store_ps(out4f, v);
1909 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1910 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned stack arrays, as BoundY reads them with aligned
// loads.
1918 ALIGN(float minposf[4]);
1919 ALIGN(float maxposf[4]);
1920 _mm_store_ps(minposf, minpos);
1921 _mm_store_ps(maxposf, maxpos);
1922 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one client vertex attribute array into the packed 4-float working
// array dpsoftrast.post_array4f[outarray], for the vertex range
// firstvertex .. firstvertex + numvertices - 1 taken from the global state.
//   outarray - index of the destination working array
//   inarray  - attribute selector: DPSOFTRAST_ARRAY_POSITION,
//              DPSOFTRAST_ARRAY_COLOR, or a texcoord array addressed relative
//              to DPSOFTRAST_ARRAY_TEXCOORD0
// Positions are expanded from 3 floats. Colors come from the float pointer if
// set, else the unsigned-byte pointer if set, else the constant
// dpsoftrast.color. Texcoords are expanded according to their component count.
// NOTE(review): the switch header, break statements, case labels for the
// component counts, fallback branches and the return statement are not visible
// in this excerpt.
1928 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1931 float *outf = dpsoftrast.post_array4f[outarray];
1932 const unsigned char *inb;
1933 int firstvertex = dpsoftrast.firstvertex;
1934 int numvertices = dpsoftrast.numvertices;
1938 case DPSOFTRAST_ARRAY_POSITION:
1939 stride = dpsoftrast.stride_vertex;
// Byte address of the first vertex in range.
1940 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1941 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1943 case DPSOFTRAST_ARRAY_COLOR:
1944 stride = dpsoftrast.stride_color;
1945 if (dpsoftrast.pointer_color4f)
1947 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1948 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1950 else if (dpsoftrast.pointer_color4ub)
// NOTE(review): stride was already loaded from stride_color above; this
// reassignment looks redundant.
1952 stride = dpsoftrast.stride_color;
1953 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1954 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// No color array set: replicate the constant color.
1958 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0.
1962 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1963 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1965 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1966 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1969 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1972 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// When inarray >= 0 the working array is first filled from that client array
// via DPSOFTRAST_Array_Load; a negative inarray reuses whatever is already in
// dpsoftrast.post_array4f[outarray].
// NOTE(review): the return statement is not visible in this excerpt.
1987 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1989 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1990 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it.
// When inarray >= 0 the working array is first filled from that client array
// via DPSOFTRAST_Array_Load; a negative inarray reuses the existing contents.
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_Project is stored in dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt.
1995 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1998 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1999 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen
// space.
// When inarray >= 0 the working array is first filled from that client array
// via DPSOFTRAST_Array_Load; a negative inarray reuses the existing contents.
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_TransformProject is stored in dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt.
2007 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2010 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2011 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for one span, covering startx..endx-1.
//   triangle->w[0], w[1], w[2] - x slope, y slope and constant term of w
//   span->x, span->y           - span origin used to evaluate w
//   zf                         - output buffer
// When the x slope is exactly 0 a single constant value serves the whole span.
// Otherwise the span is walked in pieces of DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// 1/w is computed exactly at the piece boundaries and stepped linearly (dz)
// in between.
// NOTE(review): the statements that write into zf, the declarations of x, z
// and dz, and the hand-over of endz to z between pieces are not visible in
// this excerpt.
2018 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2021 int startx = span->startx;
2022 int endx = span->endx;
2023 float wslope = triangle->w[0];
// w evaluated at the span origin; x offsets inside the span are added below.
2024 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2025 float endz = 1.0f / (w + wslope * startx);
2026 if (triangle->w[0] == 0)
2028 // LordHavoc: fast flat polygons (HUD/menu)
2029 for (x = startx;x < endx;x++)
2033 for (x = startx;x < endx;)
2035 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// Last piece: stop at the final pixel of the span.
2037 if (nextsub >= endx) nextsub = endsub = endx-1;
2038 endz = 1.0f / (w + wslope * nextsub);
// Linear step across the piece; 0 when the piece is a single pixel.
2039 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2040 for (; x <= endsub; x++, z += dz)
// Writes a finished span of BGRA8 pixels (in4ub) into dpsoftrast.fb_colorpixels[0],
// honoring span->pixelmask and the thread's blend mode.
// span->pixelmask is modified in place: pixels failing the alpha-kill test (alpha byte < 128)
// and, for the alpha-based blend modes, pixels whose alpha byte is 0 are masked off first.
// The remaining pixels are processed as runs ("subspans") of consecutive unmasked pixels.
2045 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2049 int startx = span->startx;
2050 int endx = span->endx;
2053 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2054 unsigned char * RESTRICT pixelmask = span->pixelmask;
2055 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move to the span origin so that pixeli[x] and ini[x] share the same span-relative index
2058 pixeli += span->y * dpsoftrast.fb_width + span->x;
2059 // handle alphatest now (this affects depth writes too)
2060 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2061 for (x = startx;x < endx;x++)
2062 if (in4ub[x*4+3] < 128)
2063 pixelmask[x] = false;
2064 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065 // helps sprites, text and hud artwork
2066 switch(thread->fb_blendmode)
2068 case DPSOFTRAST_BLENDMODE_ALPHA:
2069 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2072 for (x = startx;x < endx;x++)
2074 if (in4ub[x*4+3] >= 1)
// skip over the run of pixels that have a non-zero alpha byte
2079 while (++x < endx && in4ub[x*4+3] >= 1) ;
2081 if (x >= endx) break;
// mask off the following run of pixels whose alpha byte is 0
2083 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2084 if (x >= endx) break;
2091 case DPSOFTRAST_BLENDMODE_OPAQUE:
2092 case DPSOFTRAST_BLENDMODE_ADD:
2093 case DPSOFTRAST_BLENDMODE_INVMOD:
2094 case DPSOFTRAST_BLENDMODE_MUL:
2095 case DPSOFTRAST_BLENDMODE_MUL2:
2096 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2097 case DPSOFTRAST_BLENDMODE_INVADD:
2100 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes pixelmask[endx] and pixelmask[endx+1], so the mask buffer
// must have room for two entries past endx - confirm against the span definition
2101 pixelmask[endx] = 1;
2102 pixelmask[endx+1] = 0;
2103 // LordHavoc: use a double loop to identify subspans, this helps the
2104 // optimized copy/blend loops to perform at their best, most triangles
2105 // have only one run of pixels, and do the search using wide reads...
2109 // if this pixel is masked off, it's probably not alone...
2116 // the 4-item search must be aligned or else it stalls badly
2117 if ((x & 3) && !pixelmask[x])
2119 if(pixelmask[x]) goto endmasked;
2123 if(pixelmask[x]) goto endmasked;
2127 if(pixelmask[x]) goto endmasked;
// scan four mask bytes at a time while all four are 0 (masked off)
2132 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2136 for (;!pixelmask[x];x++)
2138 // rather than continue the loop, just check the end variable
2143 // find length of subspan
2146 if (subx + 8 < endx)
2150 if(!pixelmask[subx]) goto endunmasked;
2154 if(!pixelmask[subx]) goto endunmasked;
2158 if(!pixelmask[subx]) goto endunmasked;
// scan four mask bytes at a time while all four are 1 (unmasked)
2163 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2167 for (;pixelmask[subx];subx++)
2169 // the checks can overshoot, so make sure to clip it...
2173 // now that we know the subspan length... process!
2174 switch(thread->fb_blendmode)
2176 case DPSOFTRAST_BLENDMODE_OPAQUE:
2180 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// alternative copy path: 16 pixels per iteration, then 4, then smaller remainders
2185 while (x + 16 <= subx)
2187 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2188 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2189 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2190 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2195 while (x + 4 <= subx)
2197 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203 pixeli[x+1] = ini[x+1];
2213 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): widens src/dst bytes to 16-bit lanes, applies blend2 to two
// pixels per iteration, then (second half of the macro) blend1 to a single pixel, and packs
// the result back to bytes with unsigned saturation
2214 #define FINISHBLEND(blend2, blend1) \
2215 for (;x + 1 < subx;x += 2) \
2218 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2221 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2226 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2227 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2229 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
// blend = the source alpha lane broadcast to all four channels of each pixel;
// dst += (src - dst) * alpha, with both factors shifted left by 4 before the high multiply
2233 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2234 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2236 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// dst += (src * alpha) >> 8
2240 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2242 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2243 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2245 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2246 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// dst = src + dst (saturated to 255 by the pack in FINISHBLEND)
2249 case DPSOFTRAST_BLENDMODE_ADD:
2250 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// dst -= (dst * src) >> 8
2252 case DPSOFTRAST_BLENDMODE_INVMOD:
2254 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2256 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// dst = (src * dst) >> 8
2259 case DPSOFTRAST_BLENDMODE_MUL:
2260 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// dst = (src * dst) >> 7, i.e. twice the MUL result
2262 case DPSOFTRAST_BLENDMODE_MUL2:
2263 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// dst -= (src * alpha) >> 8
2265 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2267 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2268 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2270 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// dst = src + dst - ((dst * alpha) >> 8)
2274 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2276 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2277 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2279 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2280 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// dst += (255 - dst) * src, with both factors shifted left by 4 before the high multiply
2283 case DPSOFTRAST_BLENDMODE_INVADD:
2285 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2287 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Fetches a single BGRA8 texel from mip level `mip` of `texture` at coordinates (x, y),
// which are scaled by the mip width/height, and stores the four bytes in c[4].
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR the result is a bilinear blend of four texels,
// otherwise the nearest texel is used. Coordinates are clamped to [0, size-1] when
// DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set, otherwise wrapped with a size-1 mask
// (which presumably relies on power-of-two mip sizes - confirm).
2295 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2296 // warning: this is SLOW, only use if the optimized per-span functions won't do
2298 const unsigned char * RESTRICT pixelbase;
2299 const unsigned char * RESTRICT pixel[4];
2300 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2301 int wrapmask[2] = { width-1, height-1 };
// rows are addressed with a negative stride from pixelbase (see "- tci[1]*width" below)
2302 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2303 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// 20.12 fixed-point texel coordinates, shifted by half a texel (2048 = 0.5 * 4096)
2305 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2306 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2307 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights for the taps (x,y), (x+1,y), (x,y+1), (x+1,y+1); they sum to 1<<24
2308 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2309 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2310 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2311 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2313 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2314 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2315 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2316 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2320 tci[0] &= wrapmask[0];
2321 tci[1] &= wrapmask[1];
2322 tci1[0] &= wrapmask[0];
2323 tci1[1] &= wrapmask[1];
// taps 1 and 3 take column tci1[0] so that they match the frac[0] weights in lerp[1] and
// lerp[3]; sampling column tci[0] for all four taps would make the horizontal blend a no-op
2325 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2326 pixel[1] = pixelbase + 4 * (tci1[0] - tci[1]*width);
2327 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2328 pixel[3] = pixelbase + 4 * (tci1[0] - tci1[1]*width);
// weighted sum per channel; >>24 removes the combined 12+12 bit weight scale
2329 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2330 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2331 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2332 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// nearest filtering: a single tap at the truncated texel coordinate
2336 int tci[2] = { x * width, y * height };
2337 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2339 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2340 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2344 tci[0] &= wrapmask[0];
2345 tci[1] &= wrapmask[1];
2347 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Samples the texture bound to texunitindex for every pixel of the span and writes the
// result as four floats per pixel to out4f (texel bytes 2,1,0,3 go to components 0,1,2,3).
// Texture coordinates are the interpolated attribute `arrayindex`, evaluated as
// (data + slope*x) * zf[x] * mip size at sub-span boundaries and stepped in 20.12 fixed
// point in between. Four inner loops cover {linear, nearest} x {clamp-to-edge, wrap}.
2356 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2359 int startx = span->startx;
2360 int endx = span->endx;
2365 float tc[2], endtc[2];
2367 unsigned int tci[2];
2368 unsigned int tci1[2];
2369 unsigned int tcimin[2];
2370 unsigned int tcimax[2];
2375 const unsigned char * RESTRICT pixelbase;
2376 const unsigned char * RESTRICT pixel[4];
2377 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2378 // if no texture is bound, just fill it with white
2381 for (x = startx;x < endx;x++)
2383 out4f[x*4+0] = 1.0f;
2384 out4f[x*4+1] = 1.0f;
2385 out4f[x*4+2] = 1.0f;
2386 out4f[x*4+3] = 1.0f;
2390 mip = triangle->mip[texunitindex];
// rows are addressed with a negative stride from pixelbase (tciwidth below is -width)
2391 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2392 // if this mipmap of the texture is 1 pixel, just fill it with that color
2393 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads texture->bytes[0..3] rather than pixelbase, i.e. it ignores the
// mip offset texture->mipmap[mip][0] - confirm that this is intended
2395 c[0] = texture->bytes[2] * (1.0f/255.0f);
2396 c[1] = texture->bytes[1] * (1.0f/255.0f);
2397 c[2] = texture->bytes[0] * (1.0f/255.0f);
2398 c[3] = texture->bytes[3] * (1.0f/255.0f);
2399 for (x = startx;x < endx;x++)
2401 out4f[x*4+0] = c[0];
2402 out4f[x*4+1] = c[1];
2403 out4f[x*4+2] = c[2];
2404 out4f[x*4+3] = c[3];
2408 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2409 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2410 flags = texture->flags;
2411 tcscale[0] = texture->mipmap[mip][2];
2412 tcscale[1] = texture->mipmap[mip][3];
2413 tciwidth = -texture->mipmap[mip][2];
2416 tcimax[0] = texture->mipmap[mip][2]-1;
2417 tcimax[1] = texture->mipmap[mip][3]-1;
2418 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2419 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinates at the first pixel; each sub-span starts from the previous endtc
2420 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2421 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2427 for (x = startx;x < endx;)
2429 unsigned int subtc[2];
2430 unsigned int substep[2];
// subscale converts a coordinate delta over the sub-span into a 20.12 per-pixel step
2431 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2432 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2433 if (nextsub >= endx)
// final sub-span: end exactly on the last pixel and rescale the step to its real length
2435 nextsub = endsub = endx-1;
2436 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2440 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2441 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2447 substep[0] = (endtc[0] - tc[0]) * subscale;
2448 substep[1] = (endtc[1] - tc[1]) * subscale;
2449 subtc[0] = tc[0] * (1<<12);
2450 subtc[1] = tc[1] * (1<<12);
// linear filter, clamp-to-edge
2453 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2455 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2458 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights for the taps (x,y), (x+1,y), (x,y+1), (x+1,y+1); they sum to 1<<24
2459 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2460 tci[0] = subtc[0]>>12;
2461 tci[1] = subtc[1]>>12;
2462 tci1[0] = tci[0] + 1;
2463 tci1[1] = tci[1] + 1;
2464 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2465 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2466 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2467 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2468 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2469 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2470 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2471 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 1/0xFF000000 removes both the 1<<24 weight scale and the 0..255 byte range
2472 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2473 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2474 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2475 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2476 out4f[x*4+0] = c[0];
2477 out4f[x*4+1] = c[1];
2478 out4f[x*4+2] = c[2];
2479 out4f[x*4+3] = c[3];
// linear filter, wrapping coordinates
2484 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2486 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2487 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2488 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2489 tci[0] = subtc[0]>>12;
2490 tci[1] = subtc[1]>>12;
2491 tci1[0] = tci[0] + 1;
2492 tci1[1] = tci[1] + 1;
2493 tci[0] &= tciwrapmask[0];
2494 tci[1] &= tciwrapmask[1];
2495 tci1[0] &= tciwrapmask[0];
2496 tci1[1] &= tciwrapmask[1];
2497 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2498 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2499 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2500 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2501 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2502 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2503 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2504 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2505 out4f[x*4+0] = c[0];
2506 out4f[x*4+1] = c[1];
2507 out4f[x*4+2] = c[2];
2508 out4f[x*4+3] = c[3];
// nearest filter, clamp-to-edge
2512 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2514 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2516 tci[0] = subtc[0]>>12;
2517 tci[1] = subtc[1]>>12;
2518 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2519 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2520 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2521 c[0] = pixel[0][2] * (1.0f / 255.0f);
2522 c[1] = pixel[0][1] * (1.0f / 255.0f);
2523 c[2] = pixel[0][0] * (1.0f / 255.0f);
2524 c[3] = pixel[0][3] * (1.0f / 255.0f);
2525 out4f[x*4+0] = c[0];
2526 out4f[x*4+1] = c[1];
2527 out4f[x*4+2] = c[2];
2528 out4f[x*4+3] = c[3];
// nearest filter, wrapping coordinates
2533 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2535 tci[0] = subtc[0]>>12;
2536 tci[1] = subtc[1]>>12;
2537 tci[0] &= tciwrapmask[0];
2538 tci[1] &= tciwrapmask[1];
2539 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2540 c[0] = pixel[0][2] * (1.0f / 255.0f);
2541 c[1] = pixel[0][1] * (1.0f / 255.0f);
2542 c[2] = pixel[0][0] * (1.0f / 255.0f);
2543 c[3] = pixel[0][3] * (1.0f / 255.0f);
2544 out4f[x*4+0] = c[0];
2545 out4f[x*4+1] = c[1];
2546 out4f[x*4+2] = c[2];
2547 out4f[x*4+3] = c[3];
2554 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2558 int startx = span->startx;
2559 int endx = span->endx;
2561 __m128 data, slope, tcscale;
2562 __m128i tcsize, tcmask, tcoffset, tcmax;
2564 __m128i subtc, substep, endsubtc;
2567 int affine; // LordHavoc: optimized affine texturing case
2568 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2569 const unsigned char * RESTRICT pixelbase;
2570 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2571 // if no texture is bound, just fill it with white
2574 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2577 mip = triangle->mip[texunitindex];
2578 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2579 // if this mipmap of the texture is 1 pixel, just fill it with that color
2580 if (texture->mipmap[mip][1] == 4)
2582 unsigned int k = *((const unsigned int *)pixelbase);
2583 for (x = startx;x < endx;x++)
2587 affine = zf[startx] == zf[endx-1];
2588 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2589 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2590 flags = texture->flags;
2591 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2592 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2593 tcscale = _mm_cvtepi32_ps(tcsize);
2594 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2595 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2596 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2598 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2599 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2600 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2601 tcmax = _mm_packs_epi32(tcmask, tcmask);
2602 for (x = startx;x < endx;)
2604 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2605 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2606 if (nextsub >= endx || affine)
2608 nextsub = endsub = endx-1;
2609 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2613 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2615 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2616 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2617 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2618 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2619 substep = _mm_slli_epi32(substep, 1);
2622 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2623 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2625 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2626 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2628 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2629 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2630 tci = _mm_madd_epi16(tci, tcoffset);
2631 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2632 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2633 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2634 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2635 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2636 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2637 fracm = _mm_srli_epi16(subtc, 1);
2638 pix1 = _mm_add_epi16(pix1,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2641 pix3 = _mm_add_epi16(pix3,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2643 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2644 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2645 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2646 pix2 = _mm_add_epi16(pix2,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2648 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2649 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2653 const unsigned char * RESTRICT ptr1;
2654 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2655 tci = _mm_madd_epi16(tci, tcoffset);
2656 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2657 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2658 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2659 fracm = _mm_srli_epi16(subtc, 1);
2660 pix1 = _mm_add_epi16(pix1,
2661 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2663 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2664 pix1 = _mm_add_epi16(pix1,
2665 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2666 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2667 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2671 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2673 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2675 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2676 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2677 tci = _mm_madd_epi16(tci, tcoffset);
2678 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680 _mm_setzero_si128());
2681 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683 _mm_setzero_si128());
2684 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2685 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2686 tci = _mm_madd_epi16(tci, tcoffset);
2687 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2688 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2689 _mm_setzero_si128());
2690 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2691 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2692 _mm_setzero_si128());
2693 fracm = _mm_srli_epi16(subtc, 1);
2694 pix1 = _mm_add_epi16(pix1,
2695 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2696 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2697 pix3 = _mm_add_epi16(pix3,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2699 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2700 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2701 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2702 pix2 = _mm_add_epi16(pix2,
2703 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2704 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2705 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2709 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2710 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2711 tci = _mm_madd_epi16(tci, tcoffset);
2712 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2713 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2714 _mm_setzero_si128());
2715 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2716 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2717 _mm_setzero_si128());
2718 fracm = _mm_srli_epi16(subtc, 1);
2719 pix1 = _mm_add_epi16(pix1,
2720 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2721 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2722 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2723 pix1 = _mm_add_epi16(pix1,
2724 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2725 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2726 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2732 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2734 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2735 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2736 tci = _mm_madd_epi16(tci, tcoffset);
2737 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2738 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2739 _mm_setzero_si128());
2740 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2742 _mm_setzero_si128());
2743 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2744 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2745 tci = _mm_madd_epi16(tci, tcoffset);
2746 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2747 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2748 _mm_setzero_si128());
2749 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2750 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2751 _mm_setzero_si128());
2752 fracm = _mm_srli_epi16(subtc, 1);
2753 pix1 = _mm_add_epi16(pix1,
2754 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2755 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2756 pix3 = _mm_add_epi16(pix3,
2757 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2758 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2759 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2760 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2761 pix2 = _mm_add_epi16(pix2,
2762 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2763 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2764 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2768 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2769 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2770 tci = _mm_madd_epi16(tci, tcoffset);
2771 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2772 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2773 _mm_setzero_si128());
2774 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2775 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2776 _mm_setzero_si128());
2777 fracm = _mm_srli_epi16(subtc, 1);
2778 pix1 = _mm_add_epi16(pix1,
2779 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2780 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2781 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2782 pix1 = _mm_add_epi16(pix1,
2783 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2784 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2785 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2792 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2794 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2796 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2797 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798 tci = _mm_madd_epi16(tci, tcoffset);
2799 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2804 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2805 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2806 tci = _mm_madd_epi16(tci, tcoffset);
2807 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2813 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2815 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2816 tci = _mm_and_si128(tci, tcmax);
2817 tci = _mm_madd_epi16(tci, tcoffset);
2818 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2823 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2824 tci = _mm_and_si128(tci, tcmax);
2825 tci = _mm_madd_epi16(tci, tcoffset);
2826 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube map variant of the BGRA8 span sampler. The visible body only fills the span's
// output pixels [startx, endx) in out4ub with 255 in every byte (opaque white); the
// texunitindex, arrayindex and zf parameters are not used by it.
2835 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count is (endx - startx) pixels * 4; the reversed difference would be negative
// and convert to an enormous size_t, writing far beyond the span buffer
2838 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Presumably returns a shadow map sample for the given vector.
// NOTE(review): the body is not visible in this listing, so the actual behavior is unconfirmed.
2841 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies the span colors in4f by the interpolated vertex attribute `arrayindex`
// and writes the products to out4f (four floats per pixel, pixels startx..endx-1).
// NOTE(review): z is assigned on a line that is not visible here; presumably z = zf[x] - confirm.
2848 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2851 int startx = span->startx;
2852 int endx = span->endx;
// fills data (value at x = 0) and slope (change per pixel) for the attribute
2857 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2858 for (x = startx;x < endx;x++)
// attribute value at pixel x, scaled by z
2861 c[0] = (data[0] + slope[0]*x) * z;
2862 c[1] = (data[1] + slope[1]*x) * z;
2863 c[2] = (data[2] + slope[2]*x) * z;
2864 c[3] = (data[3] + slope[3]*x) * z;
2865 out4f[x*4+0] = in4f[x*4+0] * c[0];
2866 out4f[x*4+1] = in4f[x*4+1] * c[1];
2867 out4f[x*4+2] = in4f[x*4+2] * c[2];
2868 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated vertex attribute `arrayindex` to out4f for every pixel of the
// span (four floats per pixel, pixels startx..endx-1).
// NOTE(review): z is assigned on a line that is not visible here; presumably z = zf[x] - confirm.
2874 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2877 int startx = span->startx;
2878 int endx = span->endx;
// fills data (value at x = 0) and slope (change per pixel) for the attribute
2883 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2884 for (x = startx;x < endx;x++)
// attribute value at pixel x, scaled by z
2887 c[0] = (data[0] + slope[0]*x) * z;
2888 c[1] = (data[1] + slope[1]*x) * z;
2889 c[2] = (data[2] + slope[2]*x) * z;
2890 c[3] = (data[3] + slope[3]*x) * z;
2891 out4f[x*4+0] = c[0];
2892 out4f[x*4+1] = c[1];
2893 out4f[x*4+2] = c[2];
2894 out4f[x*4+3] = c[3];
2900 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2902 int x, startx = span->startx, endx = span->endx;
2903 float c[4], localcolor[4];
2904 localcolor[0] = subcolor[0];
2905 localcolor[1] = subcolor[1];
2906 localcolor[2] = subcolor[2];
2907 localcolor[3] = subcolor[3];
2908 for (x = startx;x < endx;x++)
2910 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2911 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2912 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2913 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2914 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2915 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2916 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2917 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2923 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2925 int x, startx = span->startx, endx = span->endx;
2926 for (x = startx;x < endx;x++)
2928 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2929 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2930 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2931 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2937 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2939 int x, startx = span->startx, endx = span->endx;
2940 for (x = startx;x < endx;x++)
2942 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2943 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2944 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2945 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two spans of 4-float pixels for x in [startx, endx):
//   out4f = ina4f * a + inb4f * b, per channel,
// where a is one minus component 3 of the inb4f pixel.
// NOTE(review): the declaration of a/b and the assignment of b are not among the lines
// shown; presumably b = inb4f[x*4+3] so that this is an alpha blend -- confirm.
2951 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2953 int x, startx = span->startx, endx = span->endx;
2955 for (x = startx;x < endx;x++)
// weight of ina4f: inverse of inb4f's component 3
2957 a = 1.0f - inb4f[x*4+3];
2959 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2960 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2961 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2962 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2968 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2970 int x, startx = span->startx, endx = span->endx;
2971 float localcolor[4], ilerp, lerp;
2972 localcolor[0] = color[0];
2973 localcolor[1] = color[1];
2974 localcolor[2] = color[2];
2975 localcolor[3] = color[3];
2976 ilerp = 1.0f - localcolor[3];
2977 lerp = localcolor[3];
2978 for (x = startx;x < endx;x++)
2980 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2981 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2982 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2983 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a span of 8-bit pixels (4 bytes each) from in4ub by an interpolated vertex
// attribute and stores the result in out4ub.
// The attribute (data + slope*x) * zf[x] is evaluated only at sub-span boundaries,
// converted to integers scaled by 256, and stepped linearly in between; a sub-span is
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, shortened at the end of the span.
// NOTE(review): the declarations of x, data, slope, mod, endmod and the assignments of
// mod/submod (presumably from the previous endmod/endsubmod) are not among the lines
// shown, nor are the guards around the two store paths -- confirm against the full file.
2990 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2994 int startx = span->startx;
2995 int endx = span->endx;
2998 __m128i submod, substep, endsubmod;
2999 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3 in place
// (presumably RGBA -> BGRA to match the pixel byte order)
3000 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3001 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as x256 integers
3002 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3003 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3004 for (x = startx; x < endx;)
3006 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3007 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the shorter length
3008 if (nextsub >= endx)
3010 nextsub = endsub = endx-1;
3011 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the far end of this sub-span and the per-pixel integer step toward it
3015 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3016 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3017 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bit: low four lanes hold the factor for pixel x, high four lanes for pixel x+1
3018 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3019 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration; the 16-bit add applies substep once to each pixel's factor
3020 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes go into the high byte of each 16-bit lane, so mulhi with the x256 factor
// yields byte*factor; packus saturates back to bytes
3022 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3023 pix = _mm_mulhi_epu16(pix, submod);
3024 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3028 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3029 pix = _mm_mulhi_epu16(pix, submod);
3030 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute into a span of 8-bit pixels (4 bytes each).
// The attribute (data + slope*x) * zf[x] is evaluated only at sub-span boundaries,
// converted to integers scaled by 4095, and stepped linearly in between; a sub-span is
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, shortened at the end of the span.
// NOTE(review): the declarations of x, data, slope, mod, endmod and the assignments of
// mod/submod (presumably from the previous endmod/endsubmod) are not among the lines
// shown, nor are the guards around the two store paths -- confirm against the full file.
3037 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3041 int startx = span->startx;
3042 int endx = span->endx;
3045 __m128i submod, substep, endsubmod;
3046 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3 in place
// (presumably RGBA -> BGRA to match the pixel byte order)
3047 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3048 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as x4095 integers
3049 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3050 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3051 for (x = startx; x < endx;)
3053 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3054 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the shorter length
3055 if (nextsub >= endx)
3057 nextsub = endsub = endx-1;
3058 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the far end of this sub-span and the per-pixel integer step toward it
3062 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3063 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3064 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bit: low four lanes hold the value for pixel x, high four lanes for pixel x+1
3065 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3066 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration; the 16-bit add applies substep once to each pixel's value
3067 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// shift the x4095 value down by 4 bits to byte range; packus saturates to 0..255
3069 __m128i pix = _mm_srai_epi16(submod, 4);
3070 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3074 __m128i pix = _mm_srai_epi16(submod, 4);
3075 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit counterpart of the bloom add: out4ub = ina4ub + max(inb4ub - subcolor*255, 0),
// per byte, saturated to 0..255 by _mm_packus_epi16.
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3082 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3085 int x, startx = span->startx, endx = span->endx;
// subtract color scaled to 0..255; the shuffle swaps components 0 and 2, then both
// halves of the 16-bit vector receive the same four values (one copy per pixel)
3086 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3087 localcolor = _mm_packs_epi32(localcolor, localcolor);
3088 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16 bit; subs_epu16 clamps the subtraction at zero
3090 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3091 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3092 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3093 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3097 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3098 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3099 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3100 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit counterpart of the buffer multiply: out4ub = (ina4ub * inb4ub) >> 8, per byte.
// ina bytes are widened into the low byte of each 16-bit lane and inb bytes into the
// high byte, so _mm_mulhi_epu16 returns their product divided by 256.
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3105 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3108 int x, startx = span->startx, endx = span->endx;
3109 for (x = startx;x+2 <= endx;x+=2)
3111 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3112 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3113 pix1 = _mm_mulhi_epu16(pix1, pix2);
3114 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3118 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3120 pix1 = _mm_mulhi_epu16(pix1, pix2);
3121 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit counterpart of the buffer add: out4ub = ina4ub + inb4ub, per byte.
// Bytes are widened to 16 bit for the add and _mm_packus_epi16 saturates the sum to 0..255.
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3126 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3129 int x, startx = span->startx, endx = span->endx;
3130 for (x = startx;x+2 <= endx;x+=2)
3132 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3133 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3134 pix1 = _mm_add_epi16(pix1, pix2);
3135 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3139 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3140 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3141 pix1 = _mm_add_epi16(pix1, pix2);
3142 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer onto the first: out4ub = ina4ub + inb4ub * tint, per byte,
// saturated to 0..255 by _mm_packus_epi16.
// inbtintbgra supplies four floats that are used in the order given (no component swap).
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3148 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3151 int x, startx = span->startx, endx = span->endx;
// tint as x256 integers, packed (with signed saturation) to 16 bit, one copy per pixel
3152 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3153 tint = _mm_packs_epi32(tint, tint);
3154 for (x = startx;x+2 <= endx;x+=2)
// inb bytes sit in the high byte of each lane, so mulhi with the x256 tint gives byte*tint
3156 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3157 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3158 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3159 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3163 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3164 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3165 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3166 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of two buffers weighted by byte 3 of each inb4ub pixel:
//   out4ub = ina4ub + ((inb4ub - ina4ub) * inb4ub[3]) >> 8, per byte.
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3172 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3175 int x, startx = span->startx, endx = span->endx;
3176 for (x = startx;x+2 <= endx;x+=2)
3178 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3179 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel of pix2 across that pixel's four lanes
3180 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed mulhi (>> 16) leaves difference*blend >> 8
3181 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3182 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3186 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3187 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3188 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3189 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3190 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of a span toward one constant color, weighted by the color's component 3:
//   out4ub = in4ub + ((color*255 - in4ub) * (color[3]*255)) >> 8, per byte.
// Handles two pixels (8 bytes) per loop iteration, followed by a single-pixel path.
// NOTE(review): the condition guarding the single-pixel path is not among the lines
// shown; presumably it runs only when one pixel is left over -- confirm.
3195 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3198 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 with components 0 and 2 swapped, then packed to 16 bit with
// one copy per pixel
3199 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3200 localcolor = _mm_packs_epi32(localcolor, localcolor);
// lane 3 of the packed color broadcast to every lane and pre-shifted left by 4
3201 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3202 for (x = startx;x+2 <= endx;x+=2)
3204 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed mulhi (>> 16) leaves difference*weight >> 8
3205 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3206 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3210 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3211 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3212 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3219 static void DPSOFTRAST_VertexShader_Generic(void)
3221 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3222 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3223 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3224 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3225 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3228 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3230 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3231 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3234 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3235 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3237 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3238 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3239 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3242 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3245 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3247 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3250 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3252 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3255 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3260 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3261 if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3264 for (x = span->startx;x < span->endx;x++)
3265 buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3267 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3272 static void DPSOFTRAST_VertexShader_PostProcess(void)
3274 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3275 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3276 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3279 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3281 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3282 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3287 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3289 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3290 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3292 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3293 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3295 // TODO: implement saturation
3297 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3299 // TODO: implement gammaramps
3301 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3306 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3308 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3311 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3313 // this is never called (because colormask is off when this shader is used)
3314 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3317 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3318 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3323 static void DPSOFTRAST_VertexShader_FlatColor(void)
3325 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3326 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader: samples the color texture (GL20TU_COLOR,
// attribute array 2) and multiplies it by the Color_Ambient uniform, with the alpha
// byte multiplied by the Alpha uniform instead.
// Results are written straight into the framebuffer row for this span unless ALPHAKILL
// is set or the blend mode is not opaque; in that case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 is called afterwards.
// NOTE(review): the declarations of color/pix/pix2, the index advance after the
// four-pixel path and any per-pixel mask test before the single-pixel path are not
// among the lines shown -- confirm against the full file.
3329 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3332 unsigned char * RESTRICT pixelmask = span->pixelmask;
3333 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3334 int x, startx = span->startx, endx = span->endx;
3335 __m128i Color_Ambientm;
3336 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3337 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3340 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the scratch buffer when the span still needs finishing
3341 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3342 pixel = buffer_FragColorbgra8;
// ambient color as x256 integers with components 0 and 2 swapped; _mm_load_ps requires
// the uniform to be 16-byte aligned
3343 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace lane 3 with the Alpha uniform scaled by 255, then pack to 16 bit (one copy per pixel)
3344 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3345 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3346 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3347 for (x = startx;x < endx;x++)
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3350 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go into the high byte of each 16-bit lane, so mulhi with the x256 color
// yields byte*color; packus saturates back to bytes
3353 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3354 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3355 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3356 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3362 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3363 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3364 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the scratch-buffer route needs the finish pass
3366 if (pixel == buffer_FragColorbgra8)
3367 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3373 static void DPSOFTRAST_VertexShader_VertexColor(void)
3375 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3377 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader:
//   pixel = texture * (Color_Diffuse * vertexcolor + Color_Ambient)
// where vertexcolor is the DPSOFTRAST_ARRAY_COLOR attribute interpolated across the
// span and scaled by buffer_z; lane 3 of the ambient term carries the Alpha uniform and
// lane 3 of the diffuse term is cleared.
// Results are written straight into the framebuffer row for this span unless ALPHAKILL
// is set or the blend mode is not opaque; in that case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 is called afterwards.
// NOTE(review): the declarations of data, slope, mod2 and pix2, the index advance after
// the four-pixel path and any per-pixel mask test before the single-pixel path are not
// among the lines shown -- confirm against the full file.
3380 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3383 unsigned char * RESTRICT pixelmask = span->pixelmask;
3384 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385 int x, startx = span->startx, endx = span->endx;
3386 __m128i Color_Ambientm, Color_Diffusem;
3388 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3389 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3392 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the scratch buffer when the span still needs finishing
3394 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3395 pixel = buffer_FragColorbgra8;
// ambient color as x256 integers (components 0 and 2 swapped); lane 3 is replaced by the
// Alpha uniform scaled by 255, then everything is packed to 16 bit (one copy per pixel)
3396 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3397 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3398 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3399 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color as x4096 integers (components 0 and 2 swapped) with lane 3 cleared
3400 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3401 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3402 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: swap components 0 and 2, advance to startx, and pre-scale both
// value and slope by 4096 so that mulhi with the x4096 diffuse color leaves a x256 result
3403 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3404 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3405 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3406 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3407 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3408 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by one pixel in the loop increment
3409 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3411 __m128i color, mod, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3412 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// each of the four z values is broadcast in turn; data is stepped three times here so
// that, with the loop increment, it has advanced four pixels in total
3415 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3416 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3417 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3418 data = _mm_add_ps(data, slope);
3419 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3420 data = _mm_add_ps(data, slope);
3421 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3422 data = _mm_add_ps(data, slope);
3423 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3424 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3425 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3427 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3434 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3436 mod = _mm_packs_epi32(mod, mod);
3437 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3438 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the scratch-buffer route needs the finish pass
3440 if (pixel == buffer_FragColorbgra8)
3441 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3447 static void DPSOFTRAST_VertexShader_Lightmap(void)
3449 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3450 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3451 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   pixel = color * (Color_Diffuse * lightmap + Color_Ambient)   [+ Color_Glow * glow]
// The color and glow textures are sampled with TEXCOORD0 and the lightmap with
// TEXCOORD4; the glow term is only sampled when the GLOW permutation bit is set.
// Lane 3 of the ambient term carries the Alpha uniform; lane 3 of the diffuse and glow
// terms is cleared.
// Results are written straight into the framebuffer row for this span unless ALPHAKILL
// is set or the blend mode is not opaque; in that case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 is called afterwards.
// NOTE(review): the declaration of pix2, the index advance after each four-pixel path,
// any per-pixel mask test before the single-pixel paths and the `else` that presumably
// separates the glow loop from the no-glow loop are not among the lines shown -- confirm
// against the full file.
3454 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3457 unsigned char * RESTRICT pixelmask = span->pixelmask;
3458 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3459 int x, startx = span->startx, endx = span->endx;
3460 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3461 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3462 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3465 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3466 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3467 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3468 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the scratch buffer when the span still needs finishing
3469 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3470 pixel = buffer_FragColorbgra8;
// ambient color as x256 integers (components 0 and 2 swapped); lane 3 is replaced by the
// Alpha uniform scaled by 255, then everything is packed to 16 bit (one copy per pixel)
3471 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3472 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3473 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3474 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color as x256 integers (components 0 and 2 swapped) with lane 3 cleared
3475 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3476 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3477 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3478 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare its color the same way as the diffuse one
3480 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3481 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3482 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3483 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color; used by the single-pixel path below
3484 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3485 for (x = startx;x < endx;x++)
3487 __m128i color, lightmap, glow, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3488 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3491 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3492 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3493 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3494 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3495 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3496 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3497 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3498 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3499 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3500 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low half of the vector and the
// glow term in the high half (lightmap's high lanes are zero, so the sum there is just
// the glow color); the two halves are then added together
3506 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3507 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3508 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3509 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3510 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3511 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3516 for (x = startx;x < endx;x++)
3518 __m128i color, lightmap, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3519 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3522 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3523 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3524 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3525 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3526 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3527 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3528 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3534 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3535 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3536 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3537 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the scratch-buffer route needs the finish pass
3540 if (pixel == buffer_FragColorbgra8)
3541 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry points
// that follow call these two functions before their definitions appear.
3546 void DPSOFTRAST_VertexShader_LightDirection(void);
3547 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader entry point for the fake-light mode.
// It does no work of its own: it simply runs the shared LightDirection vertex shader.
3549 static void DPSOFTRAST_VertexShader_FakeLight(void)
3551 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader entry point for the fake-light mode.
// thread, triangle and span are forwarded unchanged to the shared LightDirection
// pixel shader, which picks its fake-light code path by comparing
// thread->shader_mode with SHADERMODE_FAKELIGHT.
3554 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3556 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space light-direction-map mode.
// Runs the shared LightDirection vertex shader, then additionally passes
// TEXCOORD4 through DPSOFTRAST_Array_Load; TEXCOORD4 is the coordinate set the
// LightDirection pixel shader uses for its lightmap and deluxemap lookups.
3561 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3563 DPSOFTRAST_VertexShader_LightDirection();
3564 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space light-direction-map mode.
// Forwards its arguments unchanged to the shared LightDirection pixel shader,
// which selects the model-space path by comparing thread->shader_mode with
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3567 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3569 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space light-direction-map mode.
// Performs the same two steps as the model-space variant: the shared
// LightDirection vertex shader followed by DPSOFTRAST_Array_Load on TEXCOORD4,
// the coordinate set used for the lightmap and deluxemap lookups.
3574 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3576 DPSOFTRAST_VertexShader_LightDirection();
3577 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space light-direction-map mode.
// Forwards its arguments unchanged to the shared LightDirection pixel shader,
// which selects the tangent-space path by comparing thread->shader_mode with
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3580 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3582 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the LightDirection family of shader modes (it is
// also called by the FakeLight and LightDirectionMap vertex shader entry points).
//
// For each of the dpsoftrast.numvertices vertices it
//  - expresses the LightDir uniform in the per-vertex basis made of
//    TEXCOORD1 (svector), TEXCOORD2 (tvector) and TEXCOORD3 (normal) and
//    writes the result to TEXCOORD5 with w = 0;
//  - does the same with the vector from the vertex position to the EyePosition
//    uniform and writes the result to TEXCOORD6 with w = 0.
// Afterwards the positions are handed to DPSOFTRAST_Array_TransformProject
// together with the ModelViewProjectionMatrixM1 uniform.
3587 void DPSOFTRAST_VertexShader_LightDirection(void)
3590 int numvertices = dpsoftrast.numvertices;
3592 float LightVector[4];
3593 float EyePosition[4];
3594 float EyeVectorModelSpace[4];
// Copy the LightDir and EyePosition uniforms into locals. Component [3] of
// each is loaded but never read by the code below.
3600 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3601 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3602 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3603 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3604 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3605 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3606 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3607 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the vertex arrays the loop reads: position and TEXCOORD1..3 go
// through DPSOFTRAST_Array_Load, TEXCOORD0 is run through the TexMatrixM1 uniform.
// NOTE(review): presumably these helpers fill dpsoftrast.post_array4f[], which
// is what the loop below reads - confirm against their definitions.
3608 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3609 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3610 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3611 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3612 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3613 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its svector/tvector/normal basis (xyz only).
3615 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3616 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3617 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3618 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3619 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3620 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3621 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3622 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3623 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3624 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3625 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3626 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)),
// stored in TEXCOORD5 with w forced to 0.
3627 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3628 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3629 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3632 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3633 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then the same three dot products;
// the result is stored in TEXCOORD6 with w forced to 0.
3634 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3635 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3636 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3637 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3638 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3639 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3640 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3641 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3642 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3643 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Last step: hand the positions and the model-view-projection matrix uniform
// to DPSOFTRAST_Array_TransformProject.
// NOTE(review): the meaning of the -1 output-array argument is defined by
// DPSOFTRAST_Array_TransformProject - confirm there.
3645 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helper macros.
// DPSOFTRAST_Min / DPSOFTRAST_Max expand the selected argument twice, so the
// arguments must not have side effects.
3648 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3649 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable vectors.
3650 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length (dot of v with itself) and length (its sqrt) of a 3-vector.
3651 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3652 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement-style macro that starts by computing len = sqrt(dot(v, v)).
// NOTE(review): presumably it then rescales v to unit length; the remaining
// continuation lines are not visible here, so confirm how len == 0 is handled.
3653 #define DPSOFTRAST_Vector3Normalize(v)\
3656 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader for the LightDirection family; it is also the body of the
// FakeLight and LightDirectionMap (model space / tangent space) pixel shaders.
//
// Shades the pixels x in [span->startx, span->endx) into buffer_FragColorbgra8
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread         - supplies uniform4f[], shader_permutation, shader_mode and
//                  shader_exactspecularmath.
// triangle, span - forwarded to the span/texture helpers and to
//                  DPSOFTRAST_CALCATTRIB4F.
//
// thread->shader_permutation selects one of three lighting paths:
//   SPECULAR  : glow + ambient + (diffuse + specular) * light color
//   DIFFUSE   : glow + diffusetex * (ambient + diffuse * light color)
//   otherwise : glow + ambient
// (the glow term is only added when SHADERPERMUTATION_GLOW is set).
// Within the SPECULAR and DIFFUSE paths thread->shader_mode selects where the
// light direction and light color come from.
// Color channels are clamped to 255 on the high side only before being stored.
3667 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, each sized for DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
// (the *bgra8 buffers hold 4 bytes per pixel).
3669 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3670 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679 int x, startx = span->startx, endx = span->endx;
3680 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolation parameters filled in by DPSOFTRAST_CALCATTRIB4F; the loops
// below evaluate each attribute as data + slope * x.
3681 float LightVectordata[4];
3682 float LightVectorslope[4];
3683 float EyeVectordata[4];
3684 float EyeVectorslope[4];
3685 float VectorSdata[4];
3686 float VectorSslope[4];
3687 float VectorTdata[4];
3688 float VectorTslope[4];
3689 float VectorRdata[4];
3690 float VectorRslope[4];
3692 float diffusetex[4];
3694 float surfacenormal[4];
3695 float lightnormal[4];
3696 float lightnormal_modelspace[4];
3698 float specularnormal[4];
3701 float SpecularPower;
// Load the color uniforms with components 0 and 2 swapped, so that index
// 0/1/2 lines up with byte 0/1/2 of the *bgra8 buffers they get multiplied
// with. Color_Ambient[3] carries the Alpha uniform; the other [3] slots are 0.
3703 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3704 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3705 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3706 Color_Glow[3] = 0.0f;
3707 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3708 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3709 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3710 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3711 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3712 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3713 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3714 Color_Pants[3] = 0.0f;
3715 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3716 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3717 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3718 Color_Shirt[3] = 0.0f;
// Begin the span (buffer_z is handed to the helper) and fetch the base color
// texture, addressed with TEXCOORD0; every path below reads it.
3719 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3720 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Optional pants/shirt layers.
// NOTE(review): these two buffers are only consumed by the SPECULAR loop; the
// DIFFUSE and ambient-only loops never add them - confirm that is intended.
3721 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Optional glow layer, used by all three paths.
3726 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3728 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR (ambient + diffuse + specular) ----
3730 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// Diffuse color and light color uniforms (same 0<->2 swap as above).
// LightColor is overwritten per pixel in the lightmap and fake-light modes.
3732 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3733 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3734 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3735 Color_Diffuse[3] = 0.0f;
3736 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3737 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3738 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3739 LightColor[3] = 0.0f;
3740 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3741 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3742 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3743 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3744 Color_Specular[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by the gloss texel's
// alpha byte (glosstex[3]).
3745 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is always needed here for the specular term.
3746 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3747 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: model-space deluxemaps need the S/T/R vectors plus the
// lightmap and deluxemap (addressed with TEXCOORD4); tangent-space deluxemaps
// need only the two textures; fake light needs nothing extra; the remaining
// case interpolates the per-vertex light vector (TEXCOORD5).
3749 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3751 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3752 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3753 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3754 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3757 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3759 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3762 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3764 // nothing of this needed
3768 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the SPECULAR path.
3771 for (x = startx;x < endx;x++)
// diffusetex starts as the base color bytes converted to float; with
// COLORMAPPING the pants and shirt texels, tinted by Color_Pants/Color_Shirt,
// are added on top (their [3] tints are 0, so alpha is unchanged).
3774 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3775 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3776 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3777 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3778 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3780 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3781 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3782 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3783 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3785 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3786 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3787 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3788 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte * (1/128) - 1 maps 0..255 to about
// [-1, 1); bytes 2,1,0 become x,y,z. Then normalize.
3789 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3790 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3791 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3792 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the stored direction and project it onto the
// interpolated VectorS / VectorT / VectorR, then normalize.
3794 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3796 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3797 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3801 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3802 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3803 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3804 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3806 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3807 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3808 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3809 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3811 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3812 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3813 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3814 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3816 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3817 DPSOFTRAST_Vector3Normalize(lightnormal);
3819 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Light color = lightmap byte / 256, divided by max(0.25, lightnormal[2]);
// the 0.25 floor limits the resulting boost to 4x.
3821 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3822 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3823 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3824 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded direction is used as is (it is not
// normalized in the visible code); light color = lightmap byte / 256.
3827 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3829 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3830 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3831 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3833 float f = 1.0f / 256.0f;
3834 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3835 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3836 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized interpolated eye vector,
// i.e. the surface is lit from the viewer's direction, with a white light.
// NOTE(review): z is not assigned in the lines visible here - presumably it is
// buffer_z[x]; confirm.
3839 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3841 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3842 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3843 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3844 DPSOFTRAST_Vector3Normalize(lightnormal);
3846 LightColor[0] = 1.0;
3847 LightColor[1] = 1.0;
3848 LightColor[2] = 1.0;
// Remaining modes: normalized interpolated per-vertex light vector
// (TEXCOORD5); LightColor keeps the uniform value loaded above.
// NOTE(review): presumably this is the trailing else of the chain above - the
// else keyword itself is not visible here.
3852 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3853 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3854 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3855 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse factor: dot(surface normal, light direction), clamped at 0.
3858 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular factor, variant 1 (shader_exactspecularmath): mirror the light
// direction about the surface normal and dot it with the normalized eye vector.
3860 if(thread->shader_exactspecularmath)
3862 // reflect lightnormal at surfacenormal, take the negative of that
3863 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3865 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3866 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3867 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3868 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3870 // dot of this and normalize(EyeVectorFogDepth.xyz)
3871 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3872 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3873 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3874 DPSOFTRAST_Vector3Normalize(eyenormal);
3876 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Specular factor, variant 2: half vector normalize(light + eye) dotted with
// the surface normal.
// NOTE(review): presumably the else branch of the test above - the else
// keyword itself is not visible here.
3880 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3881 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3882 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3883 DPSOFTRAST_Vector3Normalize(eyenormal);
3885 specularnormal[0] = lightnormal[0] + eyenormal[0];
3886 specularnormal[1] = lightnormal[1] + eyenormal[1];
3887 specularnormal[2] = lightnormal[2] + eyenormal[2];
3888 DPSOFTRAST_Vector3Normalize(specularnormal);
3890 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = 1 + SpecularPower uniform * (gloss alpha byte / 255).
3892 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine: [glow +] ambient + (diffuse + specular) * light color per channel;
// alpha = diffusetex alpha * Alpha uniform. Only the upper bound is clamped.
3894 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3896 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3897 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3898 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3899 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3903 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3904 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3905 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3906 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3909 buffer_FragColorbgra8[x*4+0] = d[0];
3910 buffer_FragColorbgra8[x*4+1] = d[1];
3911 buffer_FragColorbgra8[x*4+2] = d[2];
3912 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE (ambient + diffuse, no specular term) ----
3915 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3917 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3918 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3919 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3920 Color_Diffuse[3] = 0.0f;
3921 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3922 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3923 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3924 LightColor[3] = 0.0f;
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs as in the SPECULAR path, except that the eye vector is
// only set up for fake light (there is no specular term that would need it).
3927 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3929 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3930 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3931 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3932 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3935 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3937 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3940 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3942 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3946 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the DIFFUSE path.
3949 for (x = startx;x < endx;x++)
// Base color only (no pants/shirt layers are added in this path).
3952 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3953 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3954 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3955 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal map texel (byte * (1/128) - 1, bytes 2,1,0 -> x,y,z).
3956 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode, project onto interpolated VectorS/T/R, normalize.
3961 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3963 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3964 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3965 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3966 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3968 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3969 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3970 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3971 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3973 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3974 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3975 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3976 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3978 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3979 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3980 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3981 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3983 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3984 DPSOFTRAST_Vector3Normalize(lightnormal);
3986 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Light color = lightmap byte / 256, divided by max(0.25, lightnormal[2]).
3988 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3989 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3990 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3991 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: decoded direction used as is; light color = lightmap byte / 256.
3994 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3996 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3997 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3998 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4000 float f = 1.0f / 256.0f;
4001 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4002 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4003 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: light direction = normalized interpolated eye vector, white light.
// NOTE(review): z is not assigned in the lines visible here - presumably it is
// buffer_z[x]; confirm.
4006 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4008 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4009 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4010 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4011 DPSOFTRAST_Vector3Normalize(lightnormal);
4013 LightColor[0] = 1.0;
4014 LightColor[1] = 1.0;
4015 LightColor[2] = 1.0;
// Remaining modes: normalized interpolated per-vertex light vector (TEXCOORD5).
// NOTE(review): presumably the trailing else of the chain above - the else
// keyword itself is not visible here.
4019 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4020 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4021 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4022 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse factor clamped at 0, then
// [glow +] diffusetex * (ambient + diffuse color * diffuse * light color);
// alpha = diffusetex alpha * Alpha uniform. Only the upper bound is clamped.
4025 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4026 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4028 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4029 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4030 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4031 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4035 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4036 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4037 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4038 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4040 buffer_FragColorbgra8[x*4+0] = d[0];
4041 buffer_FragColorbgra8[x*4+1] = d[1];
4042 buffer_FragColorbgra8[x*4+2] = d[2];
4043 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ([glow +] diffusetex * ambient) ----
// NOTE(review): presumably the final else of the permutation test - the else
// keyword itself is not visible here.
4048 for (x = startx;x < endx;x++)
4051 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4052 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4053 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4054 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4056 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4058 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4059 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4060 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4061 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4065 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4066 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4067 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4068 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4070 buffer_FragColorbgra8[x*4+0] = d[0];
4071 buffer_FragColorbgra8[x*4+1] = d[1];
4072 buffer_FragColorbgra8[x*4+2] = d[2];
4073 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
4076 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4081 static void DPSOFTRAST_VertexShader_LightSource(void)
4084 int numvertices = dpsoftrast.numvertices;
4085 float LightPosition[4];
4086 float LightVector[4];
4087 float LightVectorModelSpace[4];
4088 float EyePosition[4];
4089 float EyeVectorModelSpace[4];
4095 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4096 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4097 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4098 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4099 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4100 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4101 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4102 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4103 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4104 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4105 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4106 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4107 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4108 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4109 for (i = 0;i < numvertices;i++)
4111 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4112 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4113 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4114 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4115 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4116 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4117 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4118 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4119 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4120 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4121 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4122 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4123 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4124 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4125 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4126 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4127 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4128 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4131 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4132 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4133 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4134 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4135 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4136 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4137 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4138 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4139 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4140 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4141 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4142 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4144 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4145 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the light-source mode: shades the pixels of one span for one light.
//   thread   - per-thread state; supplies the uniform4f values and shader_permutation flags read below
//   triangle - triangle whose attributes are interpolated across the span
//   span     - supplies startx/endx; pixels x with startx <= x < endx are processed
// The result is built in buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three loop variants follow, selected by shader_permutation: SPECULAR (diffuse + specular),
// DIFFUSE (diffuse only), and a last loop that applies only the ambient colour.
4148 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4151 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// per-span BGRA8 scratch buffers, four bytes per pixel, indexed by x*4
4152 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159 int x, startx = span->startx, endx = span->endx;
4160 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// each interpolated attribute is kept as a base value ("data") plus a per-pixel step ("slope")
4161 float CubeVectordata[4];
4162 float CubeVectorslope[4];
4163 float LightVectordata[4];
4164 float LightVectorslope[4];
4165 float EyeVectordata[4];
4166 float EyeVectorslope[4];
4168 float diffusetex[4];
4170 float surfacenormal[4];
4171 float lightnormal[4];
4173 float specularnormal[4];
4176 float SpecularPower;
4177 float CubeVector[4];
// Load the colour uniforms. Uniform components 0,1,2 are stored at indices 2,1,0 so that
// index i of each colour lines up with byte i of the ...bgra8 buffers it is multiplied with below.
4180 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4181 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4182 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4183 Color_Glow[3] = 0.0f;
4184 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4185 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4186 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the separate Alpha uniform
4187 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4188 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4189 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4190 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4191 Color_Diffuse[3] = 0.0f;
4192 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4193 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4194 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4195 Color_Specular[3] = 0.0f;
4196 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4197 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4198 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4199 Color_Pants[3] = 0.0f;
4200 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4201 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4202 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4203 Color_Shirt[3] = 0.0f;
4204 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4205 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4206 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4207 LightColor[3] = 0.0f;
// pre-scaled by 1/255 - presumably because it is multiplied by the raw gloss alpha byte in the pow() exponent below
4208 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up interpolation of the light, eye and cube vectors from TEXCOORD1, TEXCOORD2 and TEXCOORD3
4209 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4210 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4211 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4212 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4213 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// the colour texture is always sampled; the remaining textures only when their permutation bit is set
4214 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4217 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4221 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- specular variant: uses both the normal map and the gloss map ----
4222 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4224 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4225 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4226 for (x = startx;x < endx;x++)
// attenuation = 1 - |CubeVector|^2, where CubeVector is the interpolated TEXCOORD3 value times z
4229 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4230 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4231 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4232 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): pixels with attenuation below 0.01 appear to be skipped, leaving the zero written by the memset above - confirm
4233 if (attenuation < 0.01f)
4235 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
// shadowing scales the attenuation, which is then tested against the same 0.01 threshold again
4237 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4238 if (attenuation < 0.01f)
// base colour taken as raw byte values converted to float (not normalised to 0..1)
4242 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4243 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4244 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4245 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4246 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the pants and shirt layers tinted by their colours; the alpha line adds 0 because Color_Pants[3] and Color_Shirt[3] are 0
4248 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4249 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4250 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4251 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4253 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4254 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4255 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4256 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1 maps 0..255 to roughly -1..1; component 0 comes from byte 2 and component 2 from byte 0
4257 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4258 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4259 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4260 DPSOFTRAST_Vector3Normalize(surfacenormal);
// normalised direction toward the light, interpolated from TEXCOORD1
4262 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4263 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4264 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4265 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surface normal, light direction), clamped to be non-negative
4267 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4269 if(thread->shader_exactspecularmath)
4271 // reflect lightnormal at surfacenormal, take the negative of that
4272 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4274 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4275 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4276 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4277 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4279 // dot of this and normalize(EyeVectorFogDepth.xyz)
4280 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4281 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4282 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4283 DPSOFTRAST_Vector3Normalize(eyenormal);
4285 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// second specular formulation: normalize(light direction + eye direction), then dot with the surface normal
4289 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4290 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4291 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4292 DPSOFTRAST_Vector3Normalize(eyenormal);
4294 specularnormal[0] = lightnormal[0] + eyenormal[0];
4295 specularnormal[1] = lightnormal[1] + eyenormal[1];
4296 specularnormal[2] = lightnormal[2] + eyenormal[2];
4297 DPSOFTRAST_Vector3Normalize(specularnormal);
4299 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is 1 + SpecularPower * (gloss alpha byte), so the gloss map's alpha channel scales the exponent
4301 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// combine: (base * (ambient + diffuse colour * diffuse) + gloss * specular colour * specular) * light colour * attenuation,
// additionally multiplied by the cube texture sample when CUBEFILTER is set; only the upper bound (255) is clamped
4303 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4305 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4306 attenuation *= (1.0f / 255.0f);
4307 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4308 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4309 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4310 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4314 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4315 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4316 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4317 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4319 buffer_FragColorbgra8[x*4+0] = d[0];
4320 buffer_FragColorbgra8[x*4+1] = d[1];
4321 buffer_FragColorbgra8[x*4+2] = d[2];
4322 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- diffuse variant: same as above without the gloss map and specular term ----
4325 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4327 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4328 for (x = startx;x < endx;x++)
// attenuation = 1 - |CubeVector|^2, optionally scaled by the shadow map sample
4331 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4332 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4333 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4334 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4335 if (attenuation < 0.01f)
4337 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4339 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4340 if (attenuation < 0.01f)
4344 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4345 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4346 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4347 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4348 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4350 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4351 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4352 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4353 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// decode and normalise the normal map sample (byte/128 - 1, bytes 2,1,0 -> components 0,1,2)
4355 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4356 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4357 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4358 DPSOFTRAST_Vector3Normalize(surfacenormal);
4360 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4361 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4362 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4363 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term clamped to be non-negative
4365 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4366 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4368 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4369 attenuation *= (1.0f / 255.0f);
4370 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4371 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4372 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4373 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4377 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4378 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4379 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4380 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4382 buffer_FragColorbgra8[x*4+0] = d[0];
4383 buffer_FragColorbgra8[x*4+1] = d[1];
4384 buffer_FragColorbgra8[x*4+2] = d[2];
4385 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- ambient-only variant: no normal map lookup; base colour * ambient * light colour * attenuation ----
4390 for (x = startx;x < endx;x++)
4393 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4394 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4395 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4396 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4397 if (attenuation < 0.01f)
4399 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4401 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4402 if (attenuation < 0.01f)
4406 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4407 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4408 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4409 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4410 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4412 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4413 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4414 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4415 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4417 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4419 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4420 attenuation *= (1.0f / 255.0f);
4421 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4422 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4423 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4424 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4428 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4429 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4430 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4431 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4433 buffer_FragColorbgra8[x*4+0] = d[0];
4434 buffer_FragColorbgra8[x*4+1] = d[1];
4435 buffer_FragColorbgra8[x*4+2] = d[2];
4436 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the common span-finish routine
4439 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction mode. Issues three array transforms:
//   - POSITION with the ModelViewProjection matrix, paired with TEXCOORD4
//     (the refraction pixel shader interpolates TEXCOORD4 as ModelViewProjectionPosition)
//   - TEXCOORD0 with the texture matrix
//   - POSITION with the ModelViewProjection matrix via the TransformProject variant
// NOTE(review): the helpers' first array argument is assumed to be the destination and the
// second the source - confirm against DPSOFTRAST_Array_Transform / DPSOFTRAST_Array_TransformProject.
4445 static void DPSOFTRAST_VertexShader_Refraction(void)
4447 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4448 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4449 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction mode. For each pixel of the span it derives a screen-space
// texture coordinate from the interpolated TEXCOORD4 position, offsets it by the x/y of the
// normalised normal-map sample scaled by DistortScaleRefractReflect, samples the refraction
// texture at that coordinate and multiplies the sample by RefractColor.
//   thread   - per-thread state; supplies uniform4f values and the bound textures
//   triangle - triangle whose attributes are interpolated
//   span     - supplies startx/endx; pixels x with startx <= x < endx are processed
// Returns immediately, drawing nothing, when no refraction texture is bound.
4452 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4454 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4456 int x, startx = span->startx, endx = span->endx;
4459 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4460 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolation base and per-pixel step for the TEXCOORD4 attribute
4463 float ModelViewProjectionPositiondata[4];
4464 float ModelViewProjectionPositionslope[4];
// only the first two components of these uniforms are used by this shader
4467 float ScreenScaleRefractReflect[2];
4468 float ScreenCenterRefractReflect[2];
4469 float DistortScaleRefractReflect[2];
4470 float RefractColor[4];
4472 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4473 if(!texture) return;
4476 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4477 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4480 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4483 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4484 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4485 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4486 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4487 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4488 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// uniform components 0 and 2 are swapped so that index i matches byte i of the BGRA sample; component 3 is kept in place
4489 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4490 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4491 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4492 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4495 for (x = startx;x < endx;x++)
4497 float SafeScreenTexCoord[2];
4498 float ScreenTexCoord[2];
4505 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated fourth (w) component of the TEXCOORD4 position
4506 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4508 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4509 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4510 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the GLSL quoted on the next line names DistortScaleRefractReflect.zw, but the code below reads elements 0 and 1 - confirm which is intended
4512 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map sample (byte/128 - 1, bytes 2,1,0 -> components 0,1,2) and normalise it
4513 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4514 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4515 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4516 DPSOFTRAST_Vector3Normalize(v);
// only the first two components of the normal displace the lookup
4517 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4518 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4520 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4521 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the float products are stored to unsigned char without a clamp; this relies on c[i] * RefractColor[i] staying within 0..255 - confirm
4523 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4524 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4525 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
// alpha comes only from the uniform: RefractColor[3] * 256, capped at 255
4526 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4529 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water mode. For every vertex it takes the vector from the vertex
// position to the eye position and expresses it in the basis (svector, tvector, normal) read
// from TEXCOORD1, TEXCOORD2 and TEXCOORD3, storing the result in TEXCOORD6 with w = 0.
// It then runs the ModelViewProjection transforms for POSITION and TEXCOORD4 and the
// texture-matrix transform for TEXCOORD0.
4534 static void DPSOFTRAST_VertexShader_Water(void)
4537 int numvertices = dpsoftrast.numvertices;
4538 float EyePosition[4];
4539 float EyeVectorModelSpace[4];
// NOTE(review): EyePosition[3] is loaded but only components 0..2 are read in the loop below
4545 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4546 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4547 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4548 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// load the arrays that the per-vertex loop reads from dpsoftrast.post_array4f
4549 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4550 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4551 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4552 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4553 for (i = 0;i < numvertices;i++)
// each vertex occupies four consecutive floats; only the first three are read here
4555 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4556 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4557 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4558 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4559 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4560 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4561 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4562 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4563 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4564 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4565 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4566 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye
4567 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4568 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4569 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// project that vector onto svector, tvector and normal (three dot products)
4570 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4571 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4572 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// the water pixel shader interpolates TEXCOORD6 as its EyeVector
4573 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4574 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4575 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4576 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): -1 is passed as the second array argument here, presumably because POSITION was already loaded above - confirm against DPSOFTRAST_Array_TransformProject
4578 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD4 is what the water pixel shader interpolates as ModelViewProjectionPosition
4579 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4580 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the water mode. For each pixel of the span it computes two screen-space
// texture coordinates (elements 0,1 for refraction and 2,3 for reflection) from the interpolated
// TEXCOORD4 position, displaces both by the normal-map sample, samples the refraction and
// reflection textures, and blends the two tinted samples by a Fresnel factor derived from the
// interpolated TEXCOORD6 eye vector.
//   thread   - per-thread state; supplies uniform4f values and the bound textures
//   triangle - triangle whose attributes are interpolated
//   span     - supplies startx/endx; pixels x with startx <= x < endx are processed
// Returns immediately, drawing nothing, unless both the refraction and reflection textures are bound.
4584 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4586 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4588 int x, startx = span->startx, endx = span->endx;
4591 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4592 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolation base and per-pixel step for the TEXCOORD4 and TEXCOORD6 attributes
4595 float ModelViewProjectionPositiondata[4];
4596 float ModelViewProjectionPositionslope[4];
4597 float EyeVectordata[4];
4598 float EyeVectorslope[4];
// all four components are used here: elements 0,1 feed the refraction lookup and 2,3 the reflection lookup
4601 float ScreenScaleRefractReflect[4];
4602 float ScreenCenterRefractReflect[4];
4603 float DistortScaleRefractReflect[4];
4604 float RefractColor[4];
4605 float ReflectColor[4];
4606 float ReflectFactor;
4607 float ReflectOffset;
4609 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4610 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4611 if(!texture_refraction || !texture_reflection) return;
4614 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4615 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4618 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4619 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4622 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4623 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4624 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4625 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4626 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4627 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4628 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4629 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4630 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4631 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4632 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4633 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// colour uniforms: components 0 and 2 are swapped so that index i matches byte i of the BGRA samples; component 3 is kept in place
4634 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4635 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4636 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4637 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4638 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4639 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4640 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4641 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4642 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4643 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4646 for (x = startx;x < endx;x++)
4648 float SafeScreenTexCoord[4];
4649 float ScreenTexCoord[4];
// c1 receives the refraction sample and c2 the reflection sample
4652 unsigned char c1[4];
4653 unsigned char c2[4];
4658 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated fourth (w) component of the TEXCOORD4 position
4659 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4661 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4662 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4663 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4664 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4665 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4667 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// decode the normal map sample (byte/128 - 1, bytes 2,1,0 -> components 0,1,2) and normalise it
4668 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4669 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4670 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4671 DPSOFTRAST_Vector3Normalize(v);
// the normal's first two components displace both lookups, each pair with its own scale
4672 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4673 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4674 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4675 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4677 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector; the z factor is left out because v is normalised right after
4678 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4679 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4680 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4681 DPSOFTRAST_Vector3Normalize(v);
// Fresnel = min(1, 1 - v.z)^2 * ReflectFactor + ReflectOffset
4682 Fresnel = 1.0f - v[2];
4683 Fresnel = min(1.0f, Fresnel);
4684 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4686 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4687 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4688 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4689 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// linear blend: refracted * (1 - Fresnel) + reflected * Fresnel
// NOTE(review): the colour results are stored to unsigned char without a clamp; this relies on the blend staying within 0..255 - confirm
4691 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4692 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4693 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
// alpha blends only the two uniform alphas, scaled by 256 and capped at 255
4694 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4697 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as
// both array arguments and the ModelViewProjectionMatrixM1 uniform block
// (4 floats per uniform slot) as the matrix.
4702 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4704 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// No depth visualisation is computed in the visible code: the span's pixels
// [span->startx, span->endx) of a local BGRA8 buffer are zero-filled and
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4707 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only handed to DPSOFTRAST_Draw_Span_Begin; it is not read again in this function
4710 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4711 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4712 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear exactly the pixels this span covers
4713 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4714 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as
// both array arguments and the ModelViewProjectionMatrixM1 uniform block
// as the matrix; no other arrays are touched here.
4719 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4721 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// The visible code writes no geometry data: it zero-fills the span's pixels
// [span->startx, span->endx) in a local BGRA8 buffer and passes that buffer
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4724 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only handed to DPSOFTRAST_Draw_Span_Begin; it is not read again in this function
4727 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4728 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4729 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear exactly the pixels this span covers
4730 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4731 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as
// both array arguments and the ModelViewProjectionMatrixM1 uniform block
// as the matrix; no other arrays are touched here.
4736 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4738 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// The visible code performs no lighting: it zero-fills the span's pixels
// [span->startx, span->endx) in a local BGRA8 buffer and passes that buffer
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread rasterizer state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span to shade; only startx/endx are read directly here
4741 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only handed to DPSOFTRAST_Draw_Span_Begin; it is not read again in this function
4744 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4745 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4746 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel: clear exactly the pixels this span covers
4747 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4748 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode of the software rasterizer: its vertex and span
// entry points plus the lists of vertex arrays and texture units it uses.
// One instance per mode lives in DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table initializers supply a leading integer and
// DPSOFTRAST_Interpret_Draw reads a `lodarrayindex` member; that member's
// declaration is not visible in this excerpt.
4753 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; invoked with no arguments from DPSOFTRAST_DrawTriangles
4756 void (*Vertex)(void);
// pixel stage for one span; invoked from DPSOFTRAST_Draw_ProcessSpans
4757 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices used by the mode; consumers test each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table rows end their lists with ~0
4758 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; consumers test each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table rows end their lists with ~0
4759 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4761 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (thread->shader_mode / dpsoftrast.shader_mode).
// Each row is: leading integer (the struct's first member, compared against
// array indices as `lodarrayindex` in DPSOFTRAST_Interpret_Draw), vertex
// stage, span stage, list of vertex arrays, list of texture units.
// Both lists are terminated with ~0, which as an unsigned char is larger than
// DPSOFTRAST_ARRAY_TOTAL / DPSOFTRAST_MAXTEXTUREUNITS and is what the consumer
// loops test for.
// NOTE(review): row order presumably has to match the SHADERMODE_* enum,
// which is not visible here - confirm before reordering.
// The ShowDepth / DeferredGeometry / DeferredLightSource rows now carry an
// explicit {~0} texunits list like the Depth_Or_Shadow row. Without it the
// array was zero-initialized, i.e. sixteen references to texture unit 0 with
// no terminator, so the texture-unit loop in DPSOFTRAST_Interpret_Draw never
// stopped early for these modes.
4763 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4765 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4766 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4767 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4768 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4769 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4770 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4771 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4772 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4773 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4774 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4775 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4776 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4777 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4778 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4779 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4780 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4781 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4782 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}},
// Runs the depth comparison for one span and records the per-pixel result.
// Fills thread->pixelmaskarray for x in [startx, endx) and stores that mask
// pointer and the (possibly tightened) startx back into the span.
//   thread - supplies depthtest, fb_depthfunc and the pixelmask storage
//   span   - span to test; span->pixelmask and span->startx are written
// NOTE(review): several lines of this function (local declarations, the endx
// load/store, loop bodies and the else keyword) are not visible in this excerpt.
4785 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4790 unsigned int *depthpixel;
4794 unsigned char *pixelmask;
// depthpixel addresses the depth buffer at row span->y, column span->x, so it
// is indexed below with the span-relative x
4795 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4796 startx = span->startx;
4798 depth = span->depthbase;
4799 depthslope = span->depthslope;
4800 pixelmask = thread->pixelmaskarray;
// depth testing only happens when it is enabled AND a depth buffer is bound
4801 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4803 switch(thread->fb_depthfunc)
// each case steps d by depthslope per pixel, starting from depth + depthslope*startx;
// note the operand order: the STORED depth is on the left of each comparison
4806 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4807 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4808 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4809 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4810 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4811 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4812 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// scan for failed pixels at the left and right ends of the span; the loop
// bodies are not visible here - presumably they advance startx / retreat endx
4814 while (startx < endx && !pixelmask[startx])
4816 while (endx > startx && !pixelmask[endx-1])
4821 // no depth testing means we're just dealing with color...
4822 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the start of the range on the span
4824 span->pixelmask = pixelmask;
4825 span->startx = startx;
// Depth-buffer update pass for one span, run after the pixel shader.
// Only active when depth writes (depthmask) and depth testing are both
// enabled and a depth buffer is bound.
//   thread - supplies the depthmask / depthtest flags
//   span   - supplies depthbase, depthslope, pixelmask, startx and position
// NOTE(review): the loop body and the endx load are not visible in this
// excerpt; presumably pixels whose pixelmask entry is set receive d.
4829 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4831 int x, d, depth, depthslope, startx, endx;
4832 const unsigned char *pixelmask;
4833 unsigned int *depthpixel;
4834 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4836 depth = span->depthbase;
4837 depthslope = span->depthslope;
4838 pixelmask = span->pixelmask;
4839 startx = span->startx;
// same addressing as the depth test: row span->y, column span->x, indexed by span-relative x
4841 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d is the interpolated depth for pixel x, stepped by depthslope per pixel
4842 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans: for each span runs the depth test, the
// shader mode's Span function (when a color target is bound and the color
// mask is non-zero) and then the depth write. Resets thread->numspans to 0.
//   thread - per-thread state holding spans[], triangles[] and numspans
4848 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4851 DPSOFTRAST_State_Triangle *triangle;
4852 DPSOFTRAST_State_Span *span;
4853 for (i = 0; i < thread->numspans; i++)
4855 span = &thread->spans[i];
// span->triangle is an index into the thread's triangle buffer
4856 triangle = &thread->triangles[span->triangle];
4857 DPSOFTRAST_Draw_DepthTest(thread, span);
// empty range after depth testing; the statement guarded by this test is not
// visible in this excerpt (presumably it skips to the next span)
4858 if (span->startx >= span->endx)
4860 // run pixel shader if appropriate
4861 // do this before running depthmask code, to allow the pixelshader
4862 // to clear pixelmask values for alpha testing
4863 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4864 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4865 DPSOFTRAST_Draw_DepthWrite(thread, span);
4867 thread->numspans = 0;
// Draw command (opcode 22) and its per-thread interpreter.
// DPSOFTRAST_Interpret_Draw executes one queued Draw command on one thread:
//  - rejects the command when its y range misses both of the thread's bands
//  - for each triangle: fetches the three indices, clips against the near
//    plane, finds the 16-bit screen bounding box, and skips offscreen ones
//  - computes interpolation planes (x slope, y slope, origin) for w and for
//    every array listed by the current shader mode, plus a mip level per
//    texture unit
//  - walks scanlines and emits spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
//    pixels into thread->spans, flushing via DPSOFTRAST_Draw_ProcessSpans
//    when the span or triangle buffers fill
//  - finally drops this thread's reference on the command; when the count
//    reaches zero and the command is header-sized (arrays allocated
//    separately), command->arrays is released with MM_FREE
// NOTE(review): many structural lines (braces, else/break/continue, some
// declarations) are not visible in this excerpt; comments below describe only
// what the visible statements do.
4870 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4872 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4875 int cullface = thread->cullface;
4876 int minx, maxx, miny, maxy;
4877 int miny1, maxy1, miny2, maxy2;
4878 __m128i fbmin, fbmax;
4879 __m128 viewportcenter, viewportscale;
4880 int firstvertex = command->firstvertex;
4881 int numvertices = command->numvertices;
4882 int numtriangles = command->numtriangles;
4883 const int *element3i = command->element3i;
4884 const unsigned short *element3s = command->element3s;
4885 int clipped = command->clipped;
4892 int starty, endy, bandy;
4896 float clip0origin, clip0slope;
4898 __m128 triangleedge1, triangleedge2, trianglenormal;
4901 DPSOFTRAST_State_Triangle *triangle;
4902 DPSOFTRAST_Texture *texture;
4903 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp the thread's two scanline bands [miny1,maxy1) and [miny2,maxy2) to the scissor's y range
4904 miny = thread->fb_scissor[1];
4905 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4906 miny1 = bound(miny, thread->miny1, maxy);
4907 maxy1 = bound(miny, thread->maxy1, maxy);
4908 miny2 = bound(miny, thread->miny2, maxy);
4909 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range overlaps neither band: only release this thread's reference
4910 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4912 if (!ATOMIC_DECREMENT(command->refcount))
4914 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4915 MM_FREE(command->arrays);
4919 minx = thread->fb_scissor[0];
4920 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle as repeated 16-bit (x, y) pairs; fbmax is made inclusive by subtracting 1
4921 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4922 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4923 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4924 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4925 screen[3] = _mm_setzero_ps();
4926 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4927 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen-space coordinates; the next block of
// numvertices float[4] entries follows it, and `arrays` is advanced by the
// same stride for each further array further down
4929 const float *screencoord4f = command->arrays;
4930 const float *arrays = screencoord4f + numvertices*4;
4932 // generate the 3 edges of this triangle
4933 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices are rebased by firstvertex; they come from the 16-bit or the 32-bit element list
4936 e[0] = element3s[i*3+0] - firstvertex;
4937 e[1] = element3s[i*3+1] - firstvertex;
4938 e[2] = element3s[i*3+2] - firstvertex;
4942 e[0] = element3i[i*3+0] - firstvertex;
4943 e[1] = element3i[i*3+1] - firstvertex;
4944 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used below:
//  SKIPBACKFACE      - builds the two screen-space edges around screen[1], forms a
//                      scalar cross-product term in trianglenormal and compares it
//                      with zero (the guarded statements are not visible here)
//  CLIPPEDVERTEXLERP - stores the clip fraction clipdist[p1]/(clipdist[p1]-clipdist[p2])
//                      and projects the interpolated vertex into screen[k]
//  CLIPPEDVERTEXCOPY - loads an unclipped vertex's screen coordinate into screen[k]
//  GENATTRIBCOPY/LERP/GENATTRIBS - fetch or interpolate attribute values with
//                      the stored clip fractions, selected by a case number 0..6
4953 #define SKIPBACKFACE \
4954 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4955 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4956 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4957 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4958 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4962 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4966 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4971 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4972 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4974 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4977 #define CLIPPEDVERTEXCOPY(k,p1) \
4978 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4980 #define GENATTRIBCOPY(attrib, p1) \
4981 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4982 #define GENATTRIBLERP(attrib, p1, p2) \
4984 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4985 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4987 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4991 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4992 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4993 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4995 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4996 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4997 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5003 // calculate distance from nearplane
5004 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5005 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5006 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// near-plane clipping: the sign pattern of the three distances selects which
// vertices are copied and which are interpolated; the two cases with a single
// negative distance at vertex 2 or vertex 1, and the one with only vertex 0
// negative, fill four screen[] entries, the others fill three
5007 if (clipdist[0] >= 0.0f)
5009 if (clipdist[1] >= 0.0f)
5011 if (clipdist[2] >= 0.0f)
5014 // triangle is entirely in front of nearplane
5015 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5022 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5030 if (clipdist[2] >= 0.0f)
5032 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5039 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5046 else if (clipdist[1] >= 0.0f)
5048 if (clipdist[2] >= 0.0f)
5050 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5057 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5063 else if (clipdist[2] >= 0.0f)
5065 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5070 else continue; // triangle is entirely behind nearplane
5073 // calculate integer y coords for triangle points
// x/y of all points are truncated to integers and packed to signed 16-bit; a
// triangle repeats screen[2] in the fourth slot so min/max are unaffected
5074 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5075 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5076 screenmin = _mm_min_epi16(screeni, screenir),
5077 screenmax = _mm_max_epi16(screeni, screenir);
5078 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5079 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// intersect the bounding box with the clip rectangle
5080 screenmin = _mm_max_epi16(screenmin, fbmin);
5081 screenmax = _mm_min_epi16(screenmax, fbmax);
5082 // skip offscreen triangles
5083 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit lane 1 holds y; endy is exclusive
5085 starty = _mm_extract_epi16(screenmin, 1);
5086 endy = _mm_extract_epi16(screenmax, 1)+1;
// triangle lies entirely in the gap between the thread's two bands
5087 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the y halves of the packed (x, y) pairs as 32-bit lanes
5089 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle buffer
5092 triangle = &thread->triangles[thread->numtriangles];
5094 // calculate attribute plans for triangle data...
5095 // okay, this triangle is going to produce spans, we'd better project
5096 // the interpolants now (this is what gives perspective texturing),
5097 // this consists of simply multiplying all arrays by the W coord
5098 // (which is basically 1/Z), which will be undone per-pixel
5099 // (multiplying by Z again) to get the perspective-correct array
5102 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5103 __m128 mipedgescale, mipdensity;
// edge vectors divided by the scalar cross-product term give the factors used
// to turn per-vertex differences into screen-space x/y slopes
5104 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5105 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5106 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5107 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5108 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast each point's w (fourth component of screen[])
5109 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5110 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5111 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5112 attribedge1 = _mm_sub_ss(w0, w1);
5113 attribedge2 = _mm_sub_ss(w2, w1);
5114 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5115 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5116 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5117 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5118 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane for w: w(x, y) = w[2] + x*w[0] + y*w[1] (this is how the span code below evaluates it)
5119 _mm_store_ss(&triangle->w[0], attribxslope);
5120 _mm_store_ss(&triangle->w[1], attribyslope);
5121 _mm_store_ss(&triangle->w[2], attriborigin);
// optional clip plane: the same plane construction is applied to the third
// component of screen[] and combined with fb_clipplane to give a per-scanline
// x boundary (clip0origin + y*clip0slope) used when emitting spans
5126 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5128 float cliporigin, clipxslope, clipyslope;
5129 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5130 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5131 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5133 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5134 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5135 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5136 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5137 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5140 clip0origin = -cliporigin/clipxslope;
5141 clip0slope = -clipyslope/clipxslope;
5142 clip0dir = clipxslope > 0 ? 1 : -1;
5144 else if(clipyslope > 0)
5146 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5147 clip0slope = dpsoftrast.fb_width;
5150 else if(clipyslope < 0)
5152 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5153 clip0slope = -dpsoftrast.fb_width;
5156 else if(clip0origin < 0) continue;
// build an interpolation plane for every array the current shader mode lists
5159 mipedgescale = _mm_setzero_ps();
5160 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5162 __m128 attrib0, attrib1, attrib2;
5163 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range index marks the end of the mode's array list
5164 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array's block of numvertices float[4] entries
5166 arrays += numvertices*4;
5167 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by w before the plane is built (see the comment above about perspective)
5168 attriborigin = _mm_mul_ps(attrib1, w1);
5169 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5170 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5171 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5172 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5173 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored as [0] = x slope, [1] = y slope, [2] = origin
5174 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5175 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5176 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the mode's lod array supplies the attribute deltas used for mip selection:
// attribute change along each edge scaled by the reciprocal edge length
5177 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5179 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5180 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5181 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5182 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode lists; levels default to 0
5186 memset(triangle->mip, 0, sizeof(triangle->mip));
5187 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5189 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// an out-of-range unit marks the end of the mode's texture unit list
5190 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5192 texture = thread->texbound[texunit];
// only filters ranked above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
5193 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5195 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5196 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5197 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5198 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5199 // this will be multiplied in the texturing routine by the texture resolution
5200 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the squared density, clamped to the last available mip level
5203 y = (int)(log((float)y)*0.5f/M_LN2);
5204 if (y > texture->mipmaps - 1)
5205 y = texture->mipmaps - 1;
5206 triangle->mip[texunit] = y;
// rasterize: the first pass is limited to the thread's first band (bandy = maxy1);
// the loop increment then moves y up to at least miny2 and switches bandy to maxy2
5212 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5215 __m128 xcoords, xslope;
// per-point comparison y > screeny; movemask yields one hex nibble per 32-bit lane
5216 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5217 int yccmask = _mm_movemask_epi8(ycc);
5218 int edge0p, edge0n, edge1p, edge1n;
// four-point table: picks the two edges (previous point -> next point) that
// bracket the current scanline for each comparison pattern
5227 case 0xFFFF: /*0000*/ y = endy; continue;
5228 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5229 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5230 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5231 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5232 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5233 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5234 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5235 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5236 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5237 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5238 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5239 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5240 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5241 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5242 case 0x0000: /*1111*/ y++; continue;
// three-point table: the fourth lane duplicates point 2, so the two upper
// nibbles are always equal
5250 case 0xFFFF: /*000*/ y = endy; continue;
5251 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5252 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5253 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5254 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5255 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5256 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5257 case 0x0000: /*111*/ y++; continue;
// nexty = smallest point y that is still >= the current y (points already
// passed are replaced by a large value), limited to the current band
5260 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5261 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5262 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5263 nexty = _mm_extract_epi16(ycc, 0);
5264 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both chosen edges, then the x of each edge at scanline y (plus 0.5)
5265 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5266 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5267 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5268 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5269 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in lane 0: swap the two edges if they are out of order
5270 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5272 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5273 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5275 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
// one iteration per scanline up to nexty, stepping the edge x values and the clip boundary
5276 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5278 int startx, endx, offset;
5279 startx = _mm_cvtss_si32(xcoords);
5280 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp to the scissor's x range and drop empty scanlines
5281 if (startx < minx) startx = minx;
5282 if (endx > maxx) endx = maxx;
5283 if (startx >= endx) continue;
// clip-plane trimming of the scanline (the enclosing conditions are not visible in this excerpt)
5291 if(endx <= clip0) continue;
5292 startx = (int)clip0;
5295 else if (endx > clip0)
5297 if(startx >= clip0) continue;
// split the scanline into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5302 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5304 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5305 span->triangle = thread->numtriangles;
5309 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5310 if (span->startx >= span->endx)
// fixed-point depth for the span: base value at the span origin and per-pixel
// slope, with polygon offset applied to the base
5312 wslope = triangle->w[0];
5313 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5314 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5315 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span buffer full: shade what has been queued so far
5316 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5317 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush spans (they index into it) and restart at slot 0
5322 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5324 DPSOFTRAST_Draw_ProcessSpans(thread);
5325 thread->numtriangles = 0;
// all triangles handled: drop this thread's reference; the thread that brings
// the count to zero frees separately allocated arrays
5329 if (!ATOMIC_DECREMENT(command->refcount))
5331 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5332 MM_FREE(command->arrays);
// flush whatever is still queued for this command
5335 if (thread->numspans > 0 || thread->numtriangles > 0)
5337 DPSOFTRAST_Draw_ProcessSpans(thread);
5338 thread->numtriangles = 0;
// Reserves a Draw command together with storage for the post-transform vertex
// arrays and a private copy of the element indices.
//   firstvertex, numvertices - vertex range the draw call covers
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists; whichever is used
//                              is copied into the command's data area
// Side effects: resets dpsoftrast.post_array4f[], and points
// dpsoftrast.screencoord4f and the post_array4f[] entries of the arrays used
// by the current shader mode into the new storage.
// NOTE(review): the conditions selecting between element3s and element3i, the
// else keyword between the two allocation paths and the return statement are
// not visible in this excerpt. The MM_CALLOC result is used without a visible
// NULL check.
5343 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5347 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays per vertex are always present: screen coordinates and transformed position
5348 int datasize = 2*numvertices*sizeof(float[4]);
5349 DPSOFTRAST_Command_Draw *command;
5350 unsigned char *data;
// plus one float[4] array per vertex for every array the shader mode lists
5351 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5353 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5354 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5356 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, in whichever width is used
5359 datasize += numtriangles*sizeof(unsigned short[3]);
5361 datasize += numtriangles*sizeof(int[3]);
5362 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to live inline: allocate a header-only command and a separate,
// zeroed data block (released with MM_FREE by whoever drops the last reference)
5363 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5365 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5366 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data area directly follows the command header
5370 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5371 data = (unsigned char *)command + commandsize;
5373 command->firstvertex = firstvertex;
5374 command->numvertices = numvertices;
5375 command->numtriangles = numtriangles;
5376 command->arrays = (float *)data;
5377 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5378 dpsoftrast.firstvertex = firstvertex;
5379 dpsoftrast.numvertices = numvertices;
// carve the data area: screen coords, then position, then the shader mode's
// arrays in table order, then the indices
5380 dpsoftrast.screencoord4f = (float *)data;
5381 data += numvertices*sizeof(float[4]);
5382 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5383 data += numvertices*sizeof(float[4]);
5384 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5386 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5387 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5389 dpsoftrast.post_array4f[j] = (float *)data;
5390 data += numvertices*sizeof(float[4]);
5392 command->element3i = NULL;
5393 command->element3s = NULL;
// the command owns a copy of the indices, placed after the vertex arrays
5396 command->element3s = (unsigned short *)data;
5397 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5401 command->element3i = (int *)data;
5402 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's vertex stage, records the clamped y range of the draw and hands the
// command to the rasterizer threads.
//   firstvertex, numvertices - vertex range the draw call covers
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists, forwarded to
//                              DPSOFTRAST_Draw_AllocateDrawCommand
// NOTE(review): the return in the empty-range path and the else between the
// threaded and non-threaded paths are not visible in this excerpt.
5407 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5409 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// vertex stage of the active shader mode
5410 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// y range of the draw, clamped to [0, fb_height]
5411 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5412 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing to draw: release separately allocated arrays and take the command back
5413 if (command->starty >= command->endy)
5415 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5416 MM_FREE(command->arrays);
5417 DPSOFTRAST_UndoCommand(command->commandsize);
5420 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it in DPSOFTRAST_Interpret_Draw
5421 command->refcount = dpsoftrast.numthreads;
5423 if (dpsoftrast.usethreads)
5426 DPSOFTRAST_Draw_SyncCommands();
// signal starving threads whose first or second band overlaps the draw's y range
5427 for (i = 0; i < dpsoftrast.numthreads; i++)
5429 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5430 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5431 Thread_CondSignal(thread->drawcond);
5436 DPSOFTRAST_Draw_FlushThreads();
// Command payload for SetRenderTargets (command number 23): the framebuffer
// width and height.
// NOTE(review): DEFCOMMAND is defined outside this section; presumably it
// declares the DPSOFTRAST_Command_SetRenderTargets type used below - confirm.
5440 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
// Per-thread handler for a SetRenderTargets command: sets the
// DPSOFTRAST_VALIDATE_FB bit in the thread's validate mask. The command's
// width/height are not read by the statement visible here.
5441 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5443 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Switches the framebuffer dimensions and the depth/color target pointers in
// the global state, then queues a SetRenderTargets command carrying the new
// width and height.
//   width, height   - new framebuffer dimensions
//   depthpixels     - new depth buffer
//   colorpixels0..3 - new color buffers for render targets 0..3
5445 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5447 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any dimension or target pointer differs from the current state
// NOTE(review): the line that follows this condition is not shown in this
// listing, so what the check guards cannot be stated here - confirm in the
// complete file.
5448 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5449 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5450 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5452 dpsoftrast.fb_width = width;
5453 dpsoftrast.fb_height = height;
5454 dpsoftrast.fb_depthpixels = depthpixels;
5455 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5456 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5457 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5458 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// presumably recomputes fb_viewportcenter/fb_viewportscale from the viewport
// for the new framebuffer size - TODO confirm
5459 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue the change for the rasterizer threads
5460 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5461 command->width = width;
5462 command->height = height;
// Runs the queued commands for one thread, starting at thread->commandoffset
// and continuing until endoffset is reached; the final offset is stored back
// in thread->commandoffset.
// NOTE(review): brace-only lines and short statements (for example `break;`
// and the body of the Reset case) are not shown in this listing.
5465 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5467 int commandoffset = thread->commandoffset;
5468 while (commandoffset != endoffset)
// commandoffset indexes into the shared command pool
5470 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5471 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for one fixed-size command: call its
// DPSOFTRAST_Interpret_<name> handler, step over the aligned command struct and
// wrap the offset to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
5473 #define INTERPCOMMAND(name) \
5474 case DPSOFTRAST_OPCODE_##name : \
5475 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5476 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5477 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5478 commandoffset = 0; \
5480 INTERPCOMMAND(Viewport)
5481 INTERPCOMMAND(ClearColor)
5482 INTERPCOMMAND(ClearDepth)
5483 INTERPCOMMAND(ColorMask)
5484 INTERPCOMMAND(DepthTest)
5485 INTERPCOMMAND(ScissorTest)
5486 INTERPCOMMAND(Scissor)
5487 INTERPCOMMAND(BlendFunc)
5488 INTERPCOMMAND(BlendSubtract)
5489 INTERPCOMMAND(DepthMask)
5490 INTERPCOMMAND(DepthFunc)
5491 INTERPCOMMAND(DepthRange)
5492 INTERPCOMMAND(PolygonOffset)
5493 INTERPCOMMAND(CullFace)
5494 INTERPCOMMAND(SetTexture)
5495 INTERPCOMMAND(SetShader)
5496 INTERPCOMMAND(Uniform4f)
5497 INTERPCOMMAND(UniformMatrix4f)
5498 INTERPCOMMAND(Uniform1i)
5499 INTERPCOMMAND(SetRenderTargets)
5500 INTERPCOMMAND(ClipPlane)
// Draw is handled by hand: the step is the size recorded in the command itself
// rather than a fixed struct size
5502 case DPSOFTRAST_OPCODE_Draw:
5503 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5504 commandoffset += command->commandsize;
5505 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// store the new offset in the thread state right after the draw
5507 thread->commandoffset = commandoffset;
// NOTE(review): the statements of the Reset case are not visible in this listing
5510 case DPSOFTRAST_OPCODE_Reset:
5515 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// The loop keeps running while thread->index >= 0.
// NOTE(review): brace-only lines, an `else` and the return statement are not
// shown in this listing, so the branch structure below is inferred.
5518 static int DPSOFTRAST_Draw_Thread(void *data)
5520 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5521 while(thread->index >= 0)
// commands are pending for this thread: execute them up to the current drawcommand
5523 if (thread->commandoffset != dpsoftrast.drawcommand)
5525 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// presumably the idle path; the condition is checked again while holding
// drawmutex before the thread goes to sleep
5529 Thread_LockMutex(thread->drawmutex);
5530 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// a caller that set thread->waiting is blocked on waitcond; signal it now that
// this thread has caught up
5532 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is the flag producers test before signalling drawcond
5533 thread->starving = true;
5534 Thread_CondWait(thread->drawcond, thread->drawmutex);
5535 thread->starving = false;
5537 Thread_UnlockMutex(thread->drawmutex);
// Blocks until every rasterizer thread has consumed all queued commands, then
// marks the command pool as empty (usedcommands = 0).
// NOTE(review): brace-only lines, the local declaration of i and an `else` are
// not shown in this listing; the nesting described below is inferred.
5543 static void DPSOFTRAST_Draw_FlushThreads(void)
5545 DPSOFTRAST_State_Thread *thread;
5547 DPSOFTRAST_Draw_SyncCommands();
5548 if (dpsoftrast.usethreads)
// pass 1: wake every worker that is flagged starving but still has commands pending
5550 for (i = 0; i < dpsoftrast.numthreads; i++)
5552 thread = &dpsoftrast.threads[i];
5553 if (thread->commandoffset != dpsoftrast.drawcommand)
5555 Thread_LockMutex(thread->drawmutex);
5556 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5557 Thread_CondSignal(thread->drawcond);
5558 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker to catch up with dpsoftrast.drawcommand
5561 for (i = 0; i < dpsoftrast.numthreads; i++)
5563 thread = &dpsoftrast.threads[i];
5564 if (thread->commandoffset != dpsoftrast.drawcommand)
5566 Thread_LockMutex(thread->drawmutex);
// loop instead of testing once: a condition wait may return without a matching
// signal, so the predicate is re-checked after every wakeup before the pool is
// declared empty below
5567 while (thread->commandoffset != dpsoftrast.drawcommand)
5569 thread->waiting = true;
5570 Thread_CondWait(thread->waitcond, thread->drawmutex);
5571 thread->waiting = false;
5573 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the non-threaded path (the `else` is not visible in
// this listing): run the pending commands on the calling thread
5579 for (i = 0; i < dpsoftrast.numthreads; i++)
5581 thread = &dpsoftrast.threads[i];
5582 if (thread->commandoffset != dpsoftrast.drawcommand)
5583 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5586 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
5589 void DPSOFTRAST_Flush(void)
5591 DPSOFTRAST_Draw_FlushThreads();
// NOTE(review): only the signature of DPSOFTRAST_Finish is visible in this
// listing; its body is not shown, so its behavior is not described here.
5594 void DPSOFTRAST_Finish(void)
// Resets the global rasterizer state for a width x height framebuffer and sets
// up the per-thread state, creating worker threads when threading is enabled.
//   width, height - framebuffer dimensions
//   numthreads    - requested worker count; threading is used only when this is
//                   > 0 and Thread_HasThreads() is true, and the count is then
//                   passed through bound(1, numthreads, 64); otherwise one
//                   thread state is used
//   interlace     - passed through bound(0, interlace, 1) when threading is
//                   used, 0 otherwise
//   colorpixels   - color buffer for render target 0
//   depthpixels   - depth buffer
// NOTE(review): brace-only lines, the local declarations (i, u) and the return
// statement are not shown in this listing.
5599 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5609 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is declared and set on lines not shown in this listing
5610 dpsoftrast.bigendian = u.b[3];
5611 dpsoftrast.fb_width = width;
5612 dpsoftrast.fb_height = height;
5613 dpsoftrast.fb_depthpixels = depthpixels;
5614 dpsoftrast.fb_colorpixels[0] = colorpixels;
// render targets 1..3 start out unbound (each index cleared once; previously
// index 1 was written three times and 2/3 relied on the memset above)
5615 dpsoftrast.fb_colorpixels[1] = NULL;
5616 dpsoftrast.fb_colorpixels[2] = NULL;
5617 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5618 dpsoftrast.viewport[0] = 0;
5619 dpsoftrast.viewport[1] = 0;
5620 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5621 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5622 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at 1 with no storage allocated yet (texture_max = 0)
5623 dpsoftrast.texture_firstfree = 1;
5624 dpsoftrast.texture_end = 1;
5625 dpsoftrast.texture_max = 0;
5626 dpsoftrast.color[0] = 1;
5627 dpsoftrast.color[1] = 1;
5628 dpsoftrast.color[2] = 1;
5629 dpsoftrast.color[3] = 1;
5630 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5631 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5632 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the allocation result is used below without a NULL check
5633 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5634 for (i = 0; i < dpsoftrast.numthreads; i++)
5636 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5638 thread->cullface = GL_BACK;
5639 thread->colormask[0] = 1;
5640 thread->colormask[1] = 1;
5641 thread->colormask[2] = 1;
5642 thread->colormask[3] = 1;
5643 thread->blendfunc[0] = GL_ONE;
5644 thread->blendfunc[1] = GL_ZERO;
5645 thread->depthmask = true;
5646 thread->depthtest = true;
5647 thread->depthfunc = GL_LEQUAL;
5648 thread->scissortest = false;
5649 thread->viewport[0] = 0;
5650 thread->viewport[1] = 0;
5651 thread->viewport[2] = dpsoftrast.fb_width;
5652 thread->viewport[3] = dpsoftrast.fb_height;
5653 thread->scissor[0] = 0;
5654 thread->scissor[1] = 0;
5655 thread->scissor[2] = dpsoftrast.fb_width;
5656 thread->scissor[3] = dpsoftrast.fb_height;
5657 thread->depthrange[0] = 0;
5658 thread->depthrange[1] = 1;
5659 thread->polygonoffset[0] = 0;
5660 thread->polygonoffset[1] = 0;
5661 thread->clipplane[0] = 0;
5662 thread->clipplane[1] = 0;
5663 thread->clipplane[2] = 0;
5664 thread->clipplane[3] = 1;
// queue and synchronization bookkeeping starts out empty/idle
5666 thread->numspans = 0;
5667 thread->numtriangles = 0;
5668 thread->commandoffset = 0;
5669 thread->waiting = false;
5670 thread->starving = false;
// set every validate bit and validate once up front
5672 thread->validate = -1;
5673 DPSOFTRAST_Validate(thread, -1);
// the condition variables and mutex are created before the worker that uses them
5675 if (dpsoftrast.usethreads)
5677 thread->waitcond = Thread_CreateCond();
5678 thread->drawcond = Thread_CreateCond();
5679 thread->drawmutex = Thread_CreateMutex();
5680 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5686 void DPSOFTRAST_Shutdown(void)
5689 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5691 DPSOFTRAST_State_Thread *thread;
5692 for (i = 0; i < dpsoftrast.numthreads; i++)
5694 thread = &dpsoftrast.threads[i];
5695 Thread_LockMutex(thread->drawmutex);
5697 Thread_CondSignal(thread->drawcond);
5698 Thread_UnlockMutex(thread->drawmutex);
5699 Thread_WaitThread(thread->thread, 0);
5700 Thread_DestroyCond(thread->waitcond);
5701 Thread_DestroyCond(thread->drawcond);
5702 Thread_DestroyMutex(thread->drawmutex);
5705 for (i = 0;i < dpsoftrast.texture_end;i++)
5706 if (dpsoftrast.texture[i].bytes)
5707 MM_FREE(dpsoftrast.texture[i].bytes);
5708 if (dpsoftrast.texture)
5709 free(dpsoftrast.texture);
5710 if (dpsoftrast.threads)
5711 MM_FREE(dpsoftrast.threads);
5712 memset(&dpsoftrast, 0, sizeof(dpsoftrast));