3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Per-toolchain helpers: ALIGN (16-byte alignment), ATOMIC (4-byte alignment),
// MEMORY_BARRIER (_mm_sfence on every toolchain listed here) and the
// ATOMIC_COUNTER / ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD family.
// Apple: OSAtomic*32Barrier functions operating on a volatile int32_t.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: Win32 Interlocked* functions on a volatile LONG.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Any other GCC: __sync builtins on a volatile int.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment and Interlocked* functions on a volatile LONG.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, a no-op barrier and plain (non-atomic)
// int arithmetic for the counter operations.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Supplies _mm_cvtss_f32 through the GCC vector-extract builtin, presumably
// for GCC releases before 4.6 that do not provide it (clang is excluded).
// The version test has to spell __GNUC__: an undefined identifier such as
// __GNUC evaluates to 0 inside #if, which made the test true on every GCC.
// The minor version is only compared when the major version is exactly 4.
90 #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
91 #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
// Aligned allocation helpers built on _mm_malloc/_mm_free with ALIGN_SIZE
// alignment.
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
// calloc-style counterpart: allocates nmemb*size bytes and zero-fills them
// when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
// Alternative definitions that map straight onto the C library allocators.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the vertex attribute arrays: position, color and eight texcoord
// sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used to size
// the attribs and post_array4f tables below.
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One slot of the dpsoftrast.texture array.
126 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; its users are outside this section of the file
136 ATOMIC_COUNTER binds;
// pixel storage allocated by DPSOFTRAST_Texture_New; NULL marks a free slot
137 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width,
// [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded to COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND) and
// command structs carry the same alignment attribute as ALIGN().
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header: every command begins with an opcode and its size.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when an allocation does
// not fit in the space left before the end of the pool.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the struct DPSOFTRAST_Command_<name>, which
// starts with the same opcode/commandsize header (presumably followed by
// `fields`).
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB) and a 16 KiB command size limit.
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that DPSOFTRAST_AllocateCommand carves commands out of.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation setup.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] = per-x slope, [1] = per-y slope, [2] = base value,
// four floats each, as consumed by DPSOFTRAST_CALCATTRIB
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// Evaluate one vertex attribute at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB uses SSE registers (aligned loads); the 4F variant
// performs the same arithmetic on float[4] arrays.
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
188 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
189 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
190 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
191 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
192 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
193 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
194 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
195 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
196 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; x/y are the inputs
// that DPSOFTRAST_CALCATTRIB multiplies the attribute slopes by.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[]
// members of DPSOFTRAST_State_Thread.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate; DPSOFTRAST_Validate clears a bit and
// recomputes the matching derived state. VALIDATE_DRAW is all three combined.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc stores in thread->fb_blendmode,
// chosen from the (sfactor, dfactor) pair and the blendsubtract flag.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* functions write the
// raw state into it and DPSOFTRAST_Validate derives the fb_* values from it.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane rescaled by DPSOFTRAST_RecalcClipPlane using the viewport
// scale and center
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
// pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; DPSOFTRAST_Draw_FreeCommandPool
// compares it against the allocator's freecommand and drawcommand
289 ATOMIC(volatile int commandoffset);
// flags used by the wait/signal handshake in DPSOFTRAST_Draw_FreeCommandPool
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, vertex array pointers, the
// texture table, the worker thread array and the command pool.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers; the clear commands index them
// as y * fb_width + x
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
// copy of the viewport transform, recomputed in DPSOFTRAST_Viewport
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array below; rebased by DPSOFTRAST_Texture_Grow
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// texture table: the slot search in DPSOFTRAST_Texture_New starts at
// texture_firstfree
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
// last error message; set by the texture functions when validation fails
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand, written by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used throughout this file
363 DPSOFTRAST_State dpsoftrast;
// Depth values are stored as integers scaled by 2^30 (1024 * 1048576).
365 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
366 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit value with rounding: alpha in
// bits 24-31, red in 16-23, green in 8-15, blue in 0-7.
// NOTE(review): the macro arguments are not parenthesized, so callers should
// pass simple expressions only.
367 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts a float depth d to the stored integer form, inverted as (1-d).
368 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
// forward declarations; the definitions are outside this section
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from viewport = {x, y, width, height}.
// Element [1] is the horizontal term, [2] the vertical term (measured from
// the top using dpsoftrast.fb_height, with a negative scale), [3] is a fixed
// 0.5/0.5 pair (presumably depth) and [0] is the identity pair 0/1.
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row bands [miny1, maxy1) and [miny2, maxy2) that
// this thread covers, from its index and dpsoftrast.numthreads.
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: one band in the first half of the rows and one in the second
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise (presumably the else branch): both bands are the same slice
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Rescales thread->clipplane into thread->fb_clipplane: each coefficient is
// divided by the matching viewport scale (plane [0..3] pairs with viewport
// elements [1], [2], [3], [0]), then the constant term is reduced by the dot
// product of the viewport center with the scaled plane.
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes the framebuffer-space scissor rectangle and then the viewport
// transform, clip plane and thread row bands that depend on it.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// scissor is {x, y, width, height}; y is flipped against fb_height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// fb_scissor is stored as {x, y, width, height}
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane uses the viewport values, so the viewport is computed first
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
// Effective depth function: the configured depthfunc when depth testing is
// enabled, GL_ALWAYS otherwise.
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (sfactor, dfactor) pair in thread->blendfunc to a
// DPSOFTRAST_BLENDMODE. The pair is packed as (sfactor<<16)|dfactor for the
// switch; any pair without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only SRC_ALPHA/ONE has a dedicated mode
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch (presumably the non-subtractive branch)
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs select the MUL mode
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one bit of f is pending in
// thread->validate; the expression always evaluates to 0.
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state for every flag that is both requested in mask
// and pending in thread->validate. Each pending bit is cleared before its
// Recalc function runs.
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is returned only when index is
// in [1, texture_end) and the slot has pixel storage; callers treat a false
// result as "no such texture".
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarges the texture array and rebases every texbound pointer (global and
// per thread) from the old array to the reallocated one.
502 static void DPSOFTRAST_Texture_Grow(void)
// remembered so the bound pointers can be converted to indices afterwards
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
// capacity is raised to at least 1024 entries, or doubled
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// and no NULL check is visible here; on failure the old array pointer would
// be lost.
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase: new base + (old pointer - old base)
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions, claims a slot in
// dpsoftrast.texture, lays out the mip chain and allocates zeroed pixel
// storage. Every failed validation stores a message in
// dpsoftrast.errorstring.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six faces, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check just above
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any positive value that is not a power of two
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just claimed; the array is grown
// when the slot lies beyond the current capacity
602 dpsoftrast.texture_firstfree = texnum + 1;
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// mip chain layout: every level records its byte offset (running total
// `size`), its byte size at 4 bytes per pixel and its dimensions
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after level 0 when mipmaps are off
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free/used bookkeeping. Does nothing for an index that is not in use.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// clearing the whole slot also leaves bytes NULL, which marks the slot free
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing free slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1, each one by averaging texels of the level
// before it. Pixels are 4 bytes each; o is the output row and i0..i3 are the
// source rows (layer0/layer1 combined with row0/row1) in the parent level.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// with a single level there is nothing to derive
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 * ((height * layer + row) * width)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight source texels per output texel, rounded (+4, >>3)
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// four source texels (one per source row), rounded (+2, >>2)
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// 2x2 block of the parent level, rounded (+2, >>2)
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
// two vertically adjacent texels, rounded (+1, >>1)
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels into mip level
// 0 and then regenerates the mip chain. Source rows are consumed top to
// bottom while the destination pointer steps backwards one texture row per
// copy, so the block is stored vertically flipped.
// NOTE(review): the mip parameter is not referenced in the lines visible
// here; confirm how it is used.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// start from the end of level 0 (offset by its byte size), moved up blocky
// rows and right blockx pixels
751 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
// step back one destination row before each copy
754 dst -= texture->mipmap[0][2] * 4;
755 memcpy(dst, pixels, blockwidth * 4);
756 pixels += blockwidth * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 from pixels, one row at a time, and then
// regenerates the mip chain.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// stride is one row of level 0 in bytes; dst starts at the end of level 0
770 int i, stride = texture->mipmap[0][2]*4;
771 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
// one iteration per row of level 0
772 for (i = texture->mipmap[0][3];i > 0;i--)
775 memcpy(dst, pixels, stride);
779 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip`, or 0 when index is not a texture in
// use. mip itself is not range-checked here.
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
783 DPSOFTRAST_Texture *texture;
784 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->mipmap[mip][2];
// Returns the height of mip level `mip`, or 0 when index is not a texture in
// use. mip itself is not range-checked here.
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip`, or 0 when index is not a texture in
// use. mip itself is not range-checked here.
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
795 DPSOFTRAST_Texture *texture;
796 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when index is not a texture in use.
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
801 DPSOFTRAST_Texture *texture;
802 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset
805 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. A filter above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without the MIPMAP
// flag is reported through dpsoftrast.errorstring.
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
809 DPSOFTRAST_Texture *texture;
810 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
813 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
818 texture->filter = filter;
// forward declaration; the definition is outside this section
821 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current allocation position: copies commandpool.freecommand
// into dpsoftrast.drawcommand, issuing MEMORY_BARRIER first when threads are
// in use.
823 static void DPSOFTRAST_Draw_SyncCommands(void)
825 if(dpsoftrast.usethreads) MEMORY_BARRIER;
826 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Works towards having `space` bytes available in the command pool by
// recomputing how much of the pool the threads still have to consume and,
// when that is too much, waiting on a thread that is behind.
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
831 DPSOFTRAST_State_Thread *thread;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// checks whether the request already fits
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
837 DPSOFTRAST_Draw_SyncCommands();
// for every thread: distance from its commandoffset to freecommand, wrapped
// into the pool size; the largest distance becomes usedcommands
843 for (i = 0; i < dpsoftrast.numthreads; i++)
845 thread = &dpsoftrast.threads[i];
846 commandoffset = freecommand - thread->commandoffset;
847 if (commandoffset < 0)
848 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849 if (commandoffset > usedcommands)
852 usedcommands = commandoffset;
// checks whether there is now enough room or no thread to wait on
855 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on thread waitindex under its drawmutex; it is only waited on while
// it has not reached drawcommand, and is signalled first if it is starving
857 thread = &dpsoftrast.threads[waitindex];
858 Thread_LockMutex(thread->drawmutex);
859 if (thread->commandoffset != dpsoftrast.drawcommand)
861 thread->waiting = true;
862 if (thread->starving) Thread_CondSignal(thread->drawcond);
863 Thread_CondWait(thread->waitcond, thread->drawmutex);
864 thread->waiting = false;
866 Thread_UnlockMutex(thread->drawmutex);
868 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of COMMAND_SIZE
// (the mask arithmetic assumes COMMAND_SIZE is a power of two).
// DPSOFTRAST_ALLOCATECOMMAND(name) allocates a DPSOFTRAST_Command_<name> of
// that rounded size with opcode DPSOFTRAST_OPCODE_<name>.
871 #define DPSOFTRAST_ALIGNCOMMAND(size) \
872 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
873 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
874 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and fills in the command header. When the command does not fit
// before the end of the pool, a Reset command is written at the current
// position and the rest of the pool is counted as used.
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
878 DPSOFTRAST_Command *command;
879 int freecommand = dpsoftrast.commandpool.freecommand;
880 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra = room for one more command header, plus the unusable tail of the
// pool when this command does not fit before the end
881 int extra = sizeof(DPSOFTRAST_Command);
882 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: make room, then reload the pool positions
884 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
886 if (dpsoftrast.usethreads)
887 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
889 DPSOFTRAST_Draw_FlushThreads();
890 freecommand = dpsoftrast.commandpool.freecommand;
891 usedcommands = dpsoftrast.commandpool.usedcommands;
// mark the tail with a Reset opcode and account for it as used
893 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
895 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896 command->opcode = DPSOFTRAST_OPCODE_Reset;
897 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
900 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901 command->opcode = opcode;
902 command->commandsize = size;
904 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
906 dpsoftrast.commandpool.freecommand = freecommand;
907 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recent allocation of `size` bytes: usedcommands is
// reduced by size and freecommand is written back, with the pool size added
// where needed for wrap-around.
911 static void DPSOFTRAST_UndoCommand(int size)
913 int freecommand = dpsoftrast.commandpool.freecommand;
914 int usedcommands = dpsoftrast.commandpool.usedcommands;
917 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918 usedcommands -= size;
919 dpsoftrast.commandpool.freecommand = freecommand;
920 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command: stores the rectangle in thread->viewport and marks the
// framebuffer-derived state for revalidation.
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
926 thread->viewport[0] = command->x;
927 thread->viewport[1] = command->y;
928 thread->viewport[2] = command->width;
929 thread->viewport[3] = command->height;
930 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Viewport command and also updates the global
// copy of the viewport and its derived center/scale.
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
934 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
937 command->width = width;
938 command->height = height;
940 dpsoftrast.viewport[0] = x;
941 dpsoftrast.viewport[1] = y;
942 dpsoftrast.viewport[2] = width;
943 dpsoftrast.viewport[3] = height;
944 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: walks the part of the scissor rectangle that falls
// inside this thread's row bands, for each color buffer that is present,
// using the command's color packed as BGRA8.
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
950 int i, x1, y1, x2, y2, w, h, x, y;
951 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
955 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956 miny1 = thread->miny1;
957 maxy1 = thread->maxy1;
958 miny2 = thread->miny2;
959 maxy2 = thread->maxy2;
960 x1 = thread->fb_scissor[0];
961 y1 = thread->fb_scissor[1];
962 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the rows to the overall extent of this thread's bands
964 if (y1 < miny1) y1 = miny1;
965 if (y2 > maxy2) y2 = maxy2;
970 // FIXME: honor fb_colormask?
971 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
972 for (i = 0;i < 4;i++)
974 if (!dpsoftrast.fb_colorpixels[i])
// first pass ends at maxy1; the loop step then moves y to at least miny2 and
// raises the limit to maxy2 for the second band
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearColor command in the command pool.
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
987 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: walks the part of the scissor rectangle that falls
// inside this thread's row bands in the depth buffer, using the command's
// depth converted to the stored integer form.
994 DEFCOMMAND(3, ClearDepth, float depth;)
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
997 int x1, y1, x2, y2, w, h, x, y;
998 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
1002 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003 miny1 = thread->miny1;
1004 maxy1 = thread->maxy1;
1005 miny2 = thread->miny2;
1006 maxy2 = thread->maxy2;
1007 x1 = thread->fb_scissor[0];
1008 y1 = thread->fb_scissor[1];
1009 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clamp the rows to the overall extent of this thread's bands
1011 if (y1 < miny1) y1 = miny1;
1012 if (y2 > maxy2) y2 = maxy2;
1017 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass ends at maxy1; the loop step then moves y to at least miny2 and
// raises the limit to maxy2 for the second band
1018 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019 for (;y < bandy;y++)
1021 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearDepth command in the command pool.
1026 void DPSOFTRAST_ClearDepth(float d)
1028 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command: normalizes each channel flag to 0/1 and builds the
// 32-bit fb_colormask. Negating a 0/1 flag gives 0 or all ones, which is then
// restricted to that channel's byte (red 0x00FF0000, green 0x0000FF00,
// blue 0x000000FF, alpha 0xFF000000).
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1035 thread->colormask[0] = command->r != 0;
1036 thread->colormask[1] = command->g != 0;
1037 thread->colormask[2] = command->b != 0;
1038 thread->colormask[3] = command->a != 0;
1039 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: allocates a ColorMask command in the command pool.
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1043 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command: stores the enable flag and marks the derived depth
// function for revalidation.
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1053 thread->depthtest = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: queues a DepthTest command carrying the enable flag.
1056 void DPSOFTRAST_DepthTest(int enable)
1058 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059 command->enable = enable;
// ScissorTest command: stores the enable flag and marks the
// framebuffer-derived state (which includes fb_scissor) for revalidation.
1062 DEFCOMMAND(6, ScissorTest, int enable;)
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1065 thread->scissortest = command->enable;
1066 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: records a ScissorTest command carrying the enable flag.
1068 void DPSOFTRAST_ScissorTest(int enable)
1070 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071 command->enable = enable;
// Command 7: Scissor rectangle (x, y, width, height as floats).
// The interpreter copies the rectangle into thread->scissor[0..3] and sets the
// DPSOFTRAST_VALIDATE_FB bit in thread->validate.
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1077 thread->scissor[0] = command->x;
1078 thread->scissor[1] = command->y;
1079 thread->scissor[2] = command->width;
1080 thread->scissor[3] = command->height;
1081 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: records a Scissor command describing the rectangle that
// DPSOFTRAST_Interpret_Scissor later copies into the thread state.
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1085 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1088 command->width = width;
1089 command->height = height;
// Command 8: BlendFunc (source and destination factors).
// The interpreter stores both factors in thread->blendfunc[] and sets the
// DPSOFTRAST_VALIDATE_BLENDFUNC bit in thread->validate.
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1095 thread->blendfunc[0] = command->sfactor;
1096 thread->blendfunc[1] = command->dfactor;
1097 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: records a BlendFunc command with the given source and
// destination blend factors.
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1101 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102 command->sfactor = sfactor;
1103 command->dfactor = dfactor;
// Command 9: BlendSubtract.
// The interpreter stores the enable flag in thread->blendsubtract and sets the
// DPSOFTRAST_VALIDATE_BLENDFUNC bit in thread->validate.
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1109 thread->blendsubtract = command->enable;
1110 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: records a BlendSubtract command carrying the enable flag.
1112 void DPSOFTRAST_BlendSubtract(int enable)
1114 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115 command->enable = enable;
// Command 10: DepthMask.
// The interpreter stores the enable flag in thread->depthmask; no validate bit
// is raised here.
1118 DEFCOMMAND(10, DepthMask, int enable;)
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1121 thread->depthmask = command->enable;
// Public entry point: records a DepthMask command carrying the enable flag.
1123 void DPSOFTRAST_DepthMask(int enable)
1125 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126 command->enable = enable;
// Command 11: DepthFunc.
// The interpreter stores the comparison function selector in thread->depthfunc.
1129 DEFCOMMAND(11, DepthFunc, int func;)
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1132 thread->depthfunc = command->func;
// Public entry point: records a DepthFunc command carrying the function selector.
1134 void DPSOFTRAST_DepthFunc(int func)
1136 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137 command->func = func;
// Command 12: DepthRange.
// The interpreter stores the near value in thread->depthrange[0] and the far
// value in thread->depthrange[1].
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1143 thread->depthrange[0] = command->nearval;
1144 thread->depthrange[1] = command->farval;
// Public entry point: records a DepthRange command with the near and far values.
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1148 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149 command->nearval = nearval;
1150 command->farval = farval;
// Command 13: PolygonOffset.
// The interpreter stores alongnormal in thread->polygonoffset[0] and intoview
// in thread->polygonoffset[1].
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1156 thread->polygonoffset[0] = command->alongnormal;
1157 thread->polygonoffset[1] = command->intoview;
// Public entry point: records a PolygonOffset command with both offset terms.
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1161 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162 command->alongnormal = alongnormal;
1163 command->intoview = intoview;
// Command 14: CullFace.
// The interpreter stores the culling mode in thread->cullface.
1166 DEFCOMMAND(14, CullFace, int mode;)
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1169 thread->cullface = command->mode;
// Public entry point: records a CullFace command carrying the culling mode.
1171 void DPSOFTRAST_CullFace(int mode)
1173 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174 command->mode = mode;
// Sets the current constant color in the global dpsoftrast state
// (dpsoftrast.color[0..3] = r, g, b, a). No command record is created here;
// the value is written directly.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads back a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0])
// into outpixels.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, addressed with a row stride of
//                              blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer bounds before copying.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clamp the requested rectangle to [0, fb_width) x [0, fb_height)
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// separate copy loops are used depending on dpsoftrast.bigendian
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// source row index is (fb_height - 1 - y): rows are read in the opposite
// vertical order from the one in which they are written to outpixels
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of the color framebuffer into one mip level of a texture.
//   index          - texture handle, resolved via DPSOFTRAST_Texture_GetByIndex
//   mip            - mip level; must be in [0, texture->mipmaps)
//   tx, ty         - destination origin in the texture
//   sx, sy         - source origin in the framebuffer
//   width, height  - size of the rectangle
// Returns without copying if the texture handle or mip level is invalid.
// Both rectangles are clamped to their surfaces and the copy size is the
// smaller of the two clamped extents.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the level width and [3] as the level height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// convert both starting rows to count from the opposite edge of the surface
1281 sy1 = sheight - sy1 - th;
1282 ty1 = theight - ty1 - th;
// row-by-row copy; tw*4 bytes per row (4 bytes per pixel)
1283 for (y = 0;y < th;y++)
1284 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// regenerate the mip chain when the texture has more than one level
1285 if (texture->mipmaps > 1)
1286 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17: SetTexture (texture unit number and texture pointer).
// The interpreter atomically decrements the binds counter of whatever texture
// this thread previously had bound to the unit (if any), then stores the new
// texture pointer in thread->texbound[unitnum].
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1292 if (thread->texbound[command->unitnum])
1293 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294 thread->texbound[command->unitnum] = command->texture;
// Public entry point: binds the texture identified by index to texture unit
// unitnum.
// Sets dpsoftrast.errorstring when unitnum is outside
// [0, DPSOFTRAST_MAXTEXTUREUNITS) or when a non-zero index does not resolve to
// a texture. An index of 0 is not treated as an error even though it yields no
// texture.
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1298 DPSOFTRAST_Command_SetTexture *command;
1299 DPSOFTRAST_Texture *texture;
1300 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1302 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1305 texture = DPSOFTRAST_Texture_GetByIndex(index);
1306 if (index && !texture)
1308 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313 command->unitnum = unitnum;
1314 command->texture = texture;
// mirror the binding in the global state as well
1316 dpsoftrast.texbound[unitnum] = texture;
// add one bind per thread up front; each thread's interpreter decrements the
// counter again when it replaces this binding
1318 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in the global
// dpsoftrast state. DPSOFTRAST_Array_Load applies the stride as a byte offset.
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1323 dpsoftrast.pointer_vertex3f = vertex3f;
1324 dpsoftrast.stride_vertex = stride;
// Selects a float color array as the vertex color source: stores the pointer
// and stride, and clears the unsigned-byte color pointer so only one of the two
// color sources is set.
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1328 dpsoftrast.pointer_color4f = color4f;
1329 dpsoftrast.pointer_color4ub = NULL;
1330 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array as the vertex color source: stores the
// pointer and stride, and clears the float color pointer so only one of the two
// color sources is set.
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1334 dpsoftrast.pointer_color4f = NULL;
1335 dpsoftrast.pointer_color4ub = color4ub;
1336 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for unit unitnum: pointer, number of
// float components per vertex, and stride.
// NOTE(review): unitnum indexes the arrays directly and no range check is
// visible here -- confirm callers always pass a valid unit.
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1340 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18: SetShader.
// The interpreter copies mode, permutation and the exactspecularmath flag into
// the thread state.
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1348 thread->shader_mode = command->mode;
1349 thread->shader_permutation = command->permutation;
1350 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: records a SetShader command and also mirrors the same
// three values into the global dpsoftrast state.
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1354 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355 command->mode = mode;
1356 command->permutation = permutation;
1357 command->exactspecularmath = exactspecularmath;
// global copy of the shader selection
1359 dpsoftrast.shader_mode = mode;
1360 dpsoftrast.shader_permutation = permutation;
1361 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19: Uniform4f (uniform index plus four floats).
// The interpreter copies the four floats into thread->uniform4f starting at
// element index*4.
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1367 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: records a Uniform4f command with the four components and
// mirrors them into dpsoftrast.uniform4f at elements index*4 .. index*4+3.
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1371 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372 command->index = index;
1373 command->val[0] = v0;
1374 command->val[1] = v1;
1375 command->val[2] = v2;
1376 command->val[3] = v3;
// global copy of the uniform
1378 dpsoftrast.uniform4f[index*4+0] = v0;
1379 dpsoftrast.uniform4f[index*4+1] = v1;
1380 dpsoftrast.uniform4f[index*4+2] = v2;
1381 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: v must point at four floats.
// Records a Uniform4f command and mirrors the values into dpsoftrast.uniform4f
// at element index*4.
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1385 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386 command->index = index;
1387 memcpy(command->val, v, sizeof(command->val));
1389 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20: UniformMatrix4f (uniform index plus 16 floats declared with ALIGN).
// The interpreter copies all 16 floats into thread->uniform4f starting at
// element index*4, so one matrix fills four consecutive 4-float slots.
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1395 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: uploads arraysize consecutive 4x4 float matrices starting
// at uniform slot `uniform`.
//   uniform    - first uniform slot; each matrix advances the slot index by 4
//   arraysize  - number of matrices read from v
//   transpose  - NOTE(review): presumably selects the transposition step below;
//                the test itself is not visible here -- confirm
//   v          - 16 floats per matrix; may be unaligned
// One UniformMatrix4f command is recorded per matrix and the result is also
// mirrored into dpsoftrast.uniform4f.
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1401 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1403 __m128 m0, m1, m2, m3;
1404 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405 command->index = (DPSOFTRAST_UNIFORM)index;
// use unaligned loads when v is not a multiple of ALIGN_SIZE, aligned otherwise
1406 if (((size_t)v)&(ALIGN_SIZE-1))
1408 m0 = _mm_loadu_ps(v);
1409 m1 = _mm_loadu_ps(v+4);
1410 m2 = _mm_loadu_ps(v+8);
1411 m3 = _mm_loadu_ps(v+12);
1415 m0 = _mm_load_ps(v);
1416 m1 = _mm_load_ps(v+4);
1417 m2 = _mm_load_ps(v+8);
1418 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of rows, then recombine the low/high halves
1422 __m128 t0, t1, t2, t3;
1423 t0 = _mm_unpacklo_ps(m0, m1);
1424 t1 = _mm_unpacklo_ps(m2, m3);
1425 t2 = _mm_unpackhi_ps(m0, m1);
1426 t3 = _mm_unpackhi_ps(m2, m3);
1427 m0 = _mm_movelh_ps(t0, t1);
1428 m1 = _mm_movehl_ps(t1, t0);
1429 m2 = _mm_movelh_ps(t2, t3);
1430 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command record (val is declared with ALIGN)
1432 _mm_store_ps(command->val, m0);
1433 _mm_store_ps(command->val+4, m1);
1434 _mm_store_ps(command->val+8, m2);
1435 _mm_store_ps(command->val+12, m3);
// global copy of the matrix; aligned stores, so these addresses must be
// 16-byte aligned
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21: Uniform1i (uniform index plus one integer).
// The interpreter stores the value in thread->uniform1i[index].
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1447 thread->uniform1i[command->index] = command->val;
// Public entry point: records a Uniform1i command for the given uniform index
// and mirrors i0 into the global dpsoftrast.uniform1i table.
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1451 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452 command->index = index;
// global copy, indexed through the command's index field
1455 dpsoftrast.uniform1i[command->index] = i0;
// Command 24: ClipPlane (four floats).
// The interpreter copies the four floats into thread->clipplane and sets the
// DPSOFTRAST_VALIDATE_FB bit in thread->validate.
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1461 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: records a ClipPlane command holding (x, y, z, w).
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1466 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467 command->clipplane[0] = x;
1468 command->clipplane[1] = y;
1469 command->clipplane[2] = z;
1470 command->clipplane[3] = w;
// Loads `size` 4-float items from a strided source into the tightly packed
// float4 array dst.
//   dst     - destination; written with aligned stores, so it must be 16-byte
//             aligned
//   src     - first source item, addressed in bytes
//   size    - number of items
//   stride  - NOTE(review): presumably the byte distance between source items;
//             the pointer advance is not visible here -- confirm
// Unaligned loads are used whenever src or stride is not a multiple of
// ALIGN_SIZE; aligned loads otherwise.
1474 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1476 float *end = dst + size*4;
1477 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1481 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1490 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float items into float4 items with the fourth component set
// to 1.0f.
//   dst     - destination float4 array; written with aligned stores
//   src     - first source item, addressed in bytes
//   size    - number of items
//   stride  - source stride in bytes (compared against sizeof(float[3]))
// When the source is tightly packed (stride == sizeof(float[3])) four items
// (12 floats) are handled per step from three 4-float loads; end4 marks the end
// of the part of dst that is a multiple of four items. Other strides are
// handled one item at a time with a single 4-float load each.
// The recurring shuffle / move_ss / shuffle triple rotates the vector so the
// unwanted lane sits in lane 0, overwrites it with 1.0f, and rotates back,
// giving (x, y, z, 1).
1497 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1499 float *end = dst + size*4;
1500 if (stride == sizeof(float[3]))
1502 float *end4 = dst + (size&~3)*4;
1503 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1507 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1508 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1509 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1512 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1514 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1515 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1516 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521 src += 4*sizeof(float[3]);
// aligned packed path: same shuffles as above, using aligned loads
1528 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1529 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1530 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1533 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1536 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1537 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1542 src += 4*sizeof(float[3]);
// per-item path: unaligned loads if src or stride is not a multiple of ALIGN_SIZE
1546 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1550 __m128 v = _mm_loadu_ps((const float *)src);
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1552 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1554 _mm_store_ps(dst, v);
1563 __m128 v = _mm_load_ps((const float *)src);
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1565 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1567 _mm_store_ps(dst, v);
// Expands `size` 2-float items into float4 items of the form (x, y, 0, 1).
//   dst     - destination float4 array; written with aligned stores
//   src     - first source item, addressed in bytes
//   size    - number of items
//   stride  - source stride in bytes (compared against sizeof(float[2]))
// When the source is tightly packed, two items are produced from each 4-float
// load; end2 marks the end of the part of dst that is a multiple of two items.
1574 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1576 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output item
1577 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1578 if (stride == sizeof(float[2]))
1580 float *end2 = dst + (size&~1)*4;
1581 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1585 __m128 v = _mm_loadu_ps((const float *)src);
// first item: low half of v combined with the high half of v2
1586 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second item: high half of v moved down, high half of v2 kept
1587 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589 src += 2*sizeof(float[2]);
1596 __m128 v = _mm_load_ps((const float *)src);
1597 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1600 src += 2*sizeof(float[2]);
// single item: load two floats into the low half of v2, keeping (0, 1) on top
1606 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte items into float4 items, scaling every byte by 1/255.
//   dst     - destination float4 array; written with aligned stores
//   src     - first source item, addressed in bytes
//   size    - number of items
//   stride  - source stride in bytes (compared against sizeof(unsigned char[4]))
// When the source is tightly packed, four items (16 bytes) are converted per
// step; end4 marks the end of the part of dst that is a multiple of four items.
// Bytes are widened to 32-bit integers by unpacking against zero twice
// (8 -> 16 -> 32 bits) before the int-to-float conversion.
1612 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1614 float *end = dst + size*4;
1615 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1616 if (stride == sizeof(unsigned char[4]))
1618 float *end4 = dst + (size&~3)*4;
1619 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1623 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1624 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1626 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1629 src += 4*sizeof(unsigned char[4]);
1636 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1637 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1639 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1642 src += 4*sizeof(unsigned char[4]);
// single item: read the four bytes as one int, then widen and scale as above
1648 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1649 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a float4 array with one repeated value.
//   dst   - destination; written with aligned stores
//   src   - the four floats to replicate; read once with an unaligned load
//   size  - number of float4 items (end = dst + 4*size)
1655 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1657 float *end = dst + 4*size;
1658 __m128 v = _mm_loadu_ps(src);
1661 _mm_store_ps(dst, v);
// Transforms numitems float4 items by a 4x4 matrix.
//   out4f        - destination float4 array
//   in4f         - source float4 array; may be the same pointer as out4f
//   numitems     - number of items
//   inmatrix16f  - 16 floats, read as four 4-float vectors m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
1667 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1670 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1671 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1673 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1675 // fast case for identity matrix
1676 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1679 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1680 m0 = _mm_loadu_ps(inmatrix16f);
1681 m1 = _mm_loadu_ps(inmatrix16f + 4);
1682 m2 = _mm_loadu_ps(inmatrix16f + 8);
1683 m3 = _mm_loadu_ps(inmatrix16f + 12);
1684 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1688 __m128 v = _mm_loadu_ps(in4f);
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1693 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path: same arithmetic with an aligned load
1702 __m128 v = _mm_load_ps(in4f);
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1705 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1707 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems float4 items from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1716 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1718 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-projects one float4 vertex `in` into `out`.
// The vector is rotated to (1, x, y, z), multiplied by viewportscale, divided
// by w, offset by viewportcenter, and rotated back. viewportcenter and
// viewportscale are therefore applied in the rotated lane order, with lane 0
// corresponding to the result's fourth component, which ends up as
// viewportcenter[0] + viewportscale[0] / w.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projects one float4 vertex `in` into `out` using the same rotate / scale /
// divide-by-w / offset / rotate-back sequence as DPSOFTRAST_PROJECTVERTEX.
// NOTE(review): the statements below match DPSOFTRAST_PROJECTVERTEX exactly;
// confirm whether a separate Y-only variant was intended.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Transforms one float4 vertex by the matrix held in m0..m3:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, where p is the macro's local copy
// of the input vertex.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by an axis-aligned bounding box
// after transformation by inmatrix16f.
//   starty, endy      - outputs: truncated screen-space bounds
//   minposf, maxposf  - box corners; read with aligned loads, so both must be
//                       16-byte aligned
//   inmatrix16f       - 16 floats, read with unaligned loads
// clipmask starts with all eight corner bits set (0xFF). BBFRONT transforms
// corner k, computes clipdist = z + w, and if that is >= 0 clears bit k and
// folds the corner's projected value (lane 0 divided by w) into
// minproj/maxproj. BBCLIP then handles every corner whose bit is still set:
// for each edge neighbor (k^1, k^2, k^4) whose bit is clear, it interpolates
// along the edge by frac = clipdist[k] / (clipdist[k] - clipdist[neighbor])
// and folds that point's projection into minproj/maxproj as well.
// NOTE(review): the return statement is not visible here; presumably the
// remaining clipmask is returned -- confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap elements 0 and 1 of every matrix vector so that the transformed Y value
// lands in lane 0, where the scalar (_ss) min/max/div operations work
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring element 2 of the viewport vectors into lane 0 for the scalar math below
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to screen space
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (plus one), both truncated.
// NOTE(review): presumably the Y viewport scale is negative so the order
// flips -- confirm.
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems float4 vertices to screen space without a matrix transform.
//   out4f      - receives an unmodified copy of each input vertex
//   screen4f   - receives each projected vertex
//   starty,
//   endy       - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f       - input float4 array; may be unaligned
//   numitems   - number of vertices
// While looping, the componentwise minimum and maximum of the inputs are
// accumulated; that bounding box is then handed to DPSOFTRAST_Vertex_BoundY
// with an identity matrix, and its result is returned.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
// aligned input path: identical work using aligned loads
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
// spill the bounds to aligned float arrays for DPSOFTRAST_Vertex_BoundY
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems float4 vertices by inmatrix16f and projects them to
// screen space.
//   out4f        - receives each transformed vertex
//   screen4f     - receives each transformed-and-projected vertex
//   starty,
//   endy         - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - input float4 array; may be unaligned
//   numitems     - number of vertices
//   inmatrix16f  - 16 floats, read with unaligned loads
// If the matrix is bytewise equal to identity, the work is delegated to
// DPSOFTRAST_Vertex_Project. Otherwise the componentwise bounds of the
// untransformed inputs are accumulated and passed, together with the matrix,
// to DPSOFTRAST_Vertex_BoundY, whose result is returned.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform, i.e. in the input space
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
// aligned input path: identical work using aligned loads
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
// spill the bounds to aligned float arrays for DPSOFTRAST_Vertex_BoundY
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array for the current draw
// (dpsoftrast.firstvertex / dpsoftrast.numvertices) into the float4 working
// buffer dpsoftrast.post_array4f[outarray].
//   outarray  - index of the destination working buffer
//   inarray   - which attribute to load: DPSOFTRAST_ARRAY_POSITION,
//               DPSOFTRAST_ARRAY_COLOR, or a texcoord array
//               (DPSOFTRAST_ARRAY_TEXCOORD0 + unit)
// Every source pointer is offset by firstvertex * stride bytes before loading.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to (x, y, z, 1)
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else unsigned-byte array if set, else the
// constant dpsoftrast.color replicated for every vertex
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
// NOTE(review): stride was already loaded from stride_color above; this
// reassignment looks redundant -- confirm
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and the loader
// is chosen by the unit's component count
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// If inarray >= 0 the attribute is first loaded into post_array4f[outarray]
// via DPSOFTRAST_Array_Load; otherwise the existing contents of
// post_array4f[outarray] are used.
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without a matrix transform.
// If inarray >= 0 the attribute is first loaded via DPSOFTRAST_Array_Load;
// otherwise post_array4f[outarray] is used as-is. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// drawendy, and the return value of DPSOFTRAST_Vertex_Project is stored in
// dpsoftrast.drawclipped.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen
// space.
// If inarray >= 0 the attribute is first loaded via DPSOFTRAST_Array_Load;
// otherwise post_array4f[outarray] is used as-is. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// drawendy, and the return value of DPSOFTRAST_Vertex_TransformProject is
// stored in dpsoftrast.drawclipped.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares the per-pixel 1/w values for one span.
//   thread    - per-thread rasterizer state
//   triangle  - supplies the w plane: w[0] is the slope along x, w[1] the slope
//               along y, w[2] the constant term
//   span      - supplies the span origin (x, y) and the pixel range
//               [startx, endx)
//   zf        - NOTE(review): presumably the output buffer receiving one value
//               per pixel; the stores are not visible here -- confirm
// The reciprocal is evaluated exactly only at subspan boundaries, spaced
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart, and interpolated linearly between.
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
// w evaluated at the span origin; wslope * x is added for each pixel offset
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028 float endz = 1.0f / (w + wslope * startx);
// zero slope along x means w is constant across the span
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
2036 for (x = startx;x < endx;)
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is shortened so that it ends on the final pixel
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step within this subspan; 0 when the subspan is a single pixel
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// Final stage of span shading for 8-bit output: stores the shaded pixels of
// one span into the color buffer dpsoftrast.fb_colorpixels[0], skipping
// pixels whose span->pixelmask entry is zero and applying thread->fb_blendmode.
//   thread   - supplies shader_permutation (alpha-kill test) and fb_blendmode
//   triangle - not referenced by any line visible here
//   span     - supplies the x/y origin, startx/endx and the pixel mask
//   in4ub    - shaded source pixels, 4 bytes per pixel with alpha in byte 3,
//              indexed by the same span-relative x as the mask
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
// the same source pixels viewed as one 32-bit word per pixel
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
2058 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both destination views to the span origin: row span->y, column
// span->x of a framebuffer that is dpsoftrast.fb_width pixels wide
2062 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063 pixeli += span->y * dpsoftrast.fb_width + span->x;
2064 // handle alphatest now (this affects depth writes too)
2065 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2066 for (x = startx;x < endx;x++)
// alpha below 128 fails the test, so the pixel is dropped from the mask
2067 if (in4ub[x*4+3] < 128)
2068 pixelmask[x] = false;
2069 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070 // helps sprites, text and hud artwork
2071 switch(thread->fb_blendmode)
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2077 for (x = startx;x < endx;x++)
// first pixel with non-zero alpha found
2079 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2084 while (++x < endx && in4ub[x*4+3] >= 1) ;
2086 if (x >= endx) break;
// mask off the following run of fully transparent (alpha 0) pixels
2088 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089 if (x >= endx) break;
// the remaining blend modes are listed with no masking code visible for them
2096 case DPSOFTRAST_BLENDMODE_OPAQUE:
2097 case DPSOFTRAST_BLENDMODE_ADD:
2098 case DPSOFTRAST_BLENDMODE_INVMOD:
2099 case DPSOFTRAST_BLENDMODE_MUL:
2100 case DPSOFTRAST_BLENDMODE_MUL2:
2101 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102 case DPSOFTRAST_BLENDMODE_INVADD:
2105 // put some special values at the end of the mask to ensure the loops end
// note: these two stores land at indices endx and endx+1, i.e. past the last
// pixel of the span, so the mask buffer must have room for them
2106 pixelmask[endx] = 1;
2107 pixelmask[endx+1] = 0;
2108 // LordHavoc: use a double loop to identify subspans, this helps the
2109 // optimized copy/blend loops to perform at their best, most triangles
2110 // have only one run of pixels, and do the search using wide reads...
2114 // if this pixel is masked off, it's probably not alone...
2121 // the 4-item search must be aligned or else it stalls badly
2122 if ((x & 3) && !pixelmask[x])
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
2132 if(pixelmask[x]) goto endmasked;
// four mask bytes are read as one 32-bit word; zero means all four pixels
// are masked off
2137 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish the search one pixel at a time; the sentinel pixelmask[endx] = 1
// stops this loop at the latest
2141 for (;!pixelmask[x];x++)
2143 // rather than continue the loop, just check the end variable
2148 // find length of subspan
2151 if (subx + 8 < endx)
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
2163 if(!pixelmask[subx]) goto endunmasked;
// 0x01010101 is four mask bytes that are all 1, i.e. four consecutive
// unmasked pixels
2168 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
// finish one pixel at a time; the sentinel pixelmask[endx+1] = 0 stops this
// loop at the latest
2172 for (;pixelmask[subx];subx++)
2174 // the checks can overshoot, so make sure to clip it...
2178 // now that we know the subspan length... process!
2179 switch(thread->fb_blendmode)
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
// straight copy of the sub-span [x, subx), one 32-bit word per pixel
// NOTE(review): the memcpy and the SSE loops below are presumably
// alternatives selected by conditionals that are not visible here - confirm
2185 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// SSE2 copy: 16 pixels (four unaligned 128-bit moves) per iteration
2190 while (x + 16 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels (one unaligned 128-bit move) per iteration
2200 while (x + 4 <= subx)
2202 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2208 pixeli[x+1] = ini[x+1];
// FINISHBLEND widens the 8-bit channels of src (ini) and dst (pixeli) to
// 16-bit lanes by unpacking against zero, handles two pixels per step with
// 64-bit loads/stores and a trailing single pixel with 32-bit loads/stores,
// and packs the result back to bytes with unsigned saturation. blend2 and
// blend1 are the per-mode expressions for the two-pixel and one-pixel forms.
// In the alpha modes "blend" is the source alpha (lane 3 of each pixel)
// broadcast to all four lanes of that pixel.
// ALPHA: dst += (src - dst) * alpha; both factors are shifted left by 4 so
// that the high 16 bits of the product equal (src - dst) * alpha >> 8.
2218 case DPSOFTRAST_BLENDMODE_ALPHA:
2219 #define FINISHBLEND(blend2, blend1) \
2220 for (;x + 1 < subx;x += 2) \
2223 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2226 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2231 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2234 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2245 case DPSOFTRAST_BLENDMODE_ADDALPHA:
// ADDALPHA: dst += src * alpha >> 8
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2254 case DPSOFTRAST_BLENDMODE_ADD:
// ADD: dst = src + dst in 16-bit lanes
2255 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2257 case DPSOFTRAST_BLENDMODE_INVMOD:
// INVMOD: dst -= dst * src >> 8
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2264 case DPSOFTRAST_BLENDMODE_MUL:
// MUL: dst = src * dst >> 8
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2267 case DPSOFTRAST_BLENDMODE_MUL2:
// MUL2: dst = src * dst >> 7, i.e. twice the MUL result
2268 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2270 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// SUBALPHA: dst -= src * alpha >> 8
2272 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2279 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
// PSEUDOALPHA: dst = src + (dst - dst * alpha >> 8)
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2288 case DPSOFTRAST_BLENDMODE_INVADD:
// INVADD: dst += (255 - dst) * src >> 8, using the same shift-by-4 / high
// half multiply as the ALPHA mode
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples a single texel (nearest or bilinear) from one mip level of a 2D
// texture and writes its four bytes, in storage order, to c.
//   texture - source texture; filter selects bilinear vs nearest and flags
//             selects clamp-to-edge vs wrap addressing
//   mip     - index into texture->mipmap; entries [2] and [3] are used as
//             the level's width and height
//   x, y    - texture coordinates, scaled by width/height below
//   c       - receives the four channel bytes
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301 // warning: this is SLOW, only use if the optimized per-span functions won't do
2303 const unsigned char * RESTRICT pixelbase;
2304 const unsigned char * RESTRICT pixel[4];
2305 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
// size-1 is used both as the clamp limit and as the wrap mask; as a mask it
// only behaves as a modulo for power-of-two sizes
2306 int wrapmask[2] = { width-1, height-1 };
// pixelbase sits one row (4*width bytes) before the position given by
// mipmap[mip][0] + mipmap[mip][1]; rows are then reached with a negative
// stride (column - row*width) in the address computations below
2307 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2308 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// coordinates in 12-bit fixed point, shifted back by half a texel (2048)
2310 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights for the taps (x,y), (x+1,y), (x,y+1), (x+1,y+1); they
// sum to 0x1000*0x1000 = 1<<24, which is why the results are shifted by 24
2313 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2314 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2318 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2325 tci[0] &= wrapmask[0];
2326 tci[1] &= wrapmask[1];
2327 tci1[0] &= wrapmask[0];
2328 tci1[1] &= wrapmask[1];
// the four taps must match the weight order of lerp[]: taps 1 and 3 are the
// x+1 column (tci1[0]); using tci[0] for them would fetch the same texel
// twice and drop the horizontal half of the filter
2330 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2331 pixel[1] = pixelbase + 4 * (tci1[0] - tci[1]*width);
2332 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2333 pixel[3] = pixelbase + 4 * (tci1[0] - tci1[1]*width);
2334 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// nearest filtering: truncate the scaled coordinate to an integer texel
2341 int tci[2] = { x * width, y * height };
2342 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2344 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2349 tci[0] &= wrapmask[0];
2350 tci[1] &= wrapmask[1];
2352 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Float 2D texture sampler for one span: for every x in [startx, endx) it
// writes four floats in the 0..1 range to out4f[x*4 .. x*4+3].
//   thread       - texbound[texunitindex] is the texture to sample
//   triangle     - mip[texunitindex] selects the mip level; also feeds the
//                  attribute setup macro
//   span         - supplies startx/endx
//   out4f        - output, 4 floats per pixel; texture bytes 2,1,0,3 are
//                  written to outputs 0,1,2,3 (byte order is swapped)
//   texunitindex - texture unit to read
//   arrayindex   - vertex attribute holding the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinates (presumably 1/w from Draw_Span_Begin - confirm)
2361 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2364 int startx = span->startx;
2365 int endx = span->endx;
2370 float tc[2], endtc[2];
2372 unsigned int tci[2];
2373 unsigned int tci1[2];
2374 unsigned int tcimin[2];
2375 unsigned int tcimax[2];
2380 const unsigned char * RESTRICT pixelbase;
2381 const unsigned char * RESTRICT pixel[4];
2382 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2383 // if no texture is bound, just fill it with white
2386 for (x = startx;x < endx;x++)
2388 out4f[x*4+0] = 1.0f;
2389 out4f[x*4+1] = 1.0f;
2390 out4f[x*4+2] = 1.0f;
2391 out4f[x*4+3] = 1.0f;
2395 mip = triangle->mip[texunitindex];
// pixelbase sits one row (4*width bytes) before the position given by
// mipmap[mip][0] + mipmap[mip][1]; rows are reached with the negative
// stride tciwidth set up below
2396 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2397 // if this mipmap of the texture is 1 pixel, just fill it with that color
2398 if (texture->mipmap[mip][1] == 4)
// read the pixel through pixelbase so that the selected mip level is used;
// texture->bytes alone would address the start of the texture data instead
// (same approach as the BGRA8 variant of this sampler)
2400 c[0] = pixelbase[2] * (1.0f/255.0f);
2401 c[1] = pixelbase[1] * (1.0f/255.0f);
2402 c[2] = pixelbase[0] * (1.0f/255.0f);
2403 c[3] = pixelbase[3] * (1.0f/255.0f);
2404 for (x = startx;x < endx;x++)
2406 out4f[x*4+0] = c[0];
2407 out4f[x*4+1] = c[1];
2408 out4f[x*4+2] = c[2];
2409 out4f[x*4+3] = c[3];
2413 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// presumably fills data/slope with the attribute's base value and per-x
// slope for this span - confirm against the macro definition
2414 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2415 flags = texture->flags;
// scale factors from normalized coordinates to texels (width, height)
2416 tcscale[0] = texture->mipmap[mip][2];
2417 tcscale[1] = texture->mipmap[mip][3];
// row stride in texels; negative because rows are addressed backwards
// from pixelbase
2418 tciwidth = -texture->mipmap[mip][2];
2421 tcimax[0] = texture->mipmap[mip][2]-1;
2422 tcimax[1] = texture->mipmap[mip][3]-1;
2423 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2424 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2425 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2426 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in sub-spans: exact coordinates are computed at each
// sub-span end and stepped linearly in 12-bit fixed point in between
2432 for (x = startx;x < endx;)
2434 unsigned int subtc[2];
2435 unsigned int substep[2];
2436 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2437 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2438 if (nextsub >= endx)
// the last sub-span ends on the final pixel; rescale the step to its length
2440 nextsub = endsub = endx-1;
2441 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2445 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2446 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2452 substep[0] = (endtc[0] - tc[0]) * subscale;
2453 substep[1] = (endtc[1] - tc[1]) * subscale;
2454 subtc[0] = tc[0] * (1<<12);
2455 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp-to-edge addressing
2458 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2460 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the fraction; the four weights sum to 1<<24
2462 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2464 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465 tci[0] = subtc[0]>>12;
2466 tci[1] = subtc[1]>>12;
2467 tci1[0] = tci[0] + 1;
2468 tci1[1] = tci[1] + 1;
2469 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2470 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2471 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2472 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// taps (x,y), (x+1,y), (x,y+1), (x+1,y+1), matching the order of lerp[]
2473 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 << 24: removes the weight scale and maps bytes to 0..1
2477 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481 out4f[x*4+0] = c[0];
2482 out4f[x*4+1] = c[1];
2483 out4f[x*4+2] = c[2];
2484 out4f[x*4+3] = c[3];
// bilinear, wrapping addressing (coordinates masked with size-1)
2489 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2491 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2492 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2493 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2494 tci[0] = subtc[0]>>12;
2495 tci[1] = subtc[1]>>12;
2496 tci1[0] = tci[0] + 1;
2497 tci1[1] = tci[1] + 1;
2498 tci[0] &= tciwrapmask[0];
2499 tci[1] &= tciwrapmask[1];
2500 tci1[0] &= tciwrapmask[0];
2501 tci1[1] &= tciwrapmask[1];
2502 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2503 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2504 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2505 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2506 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2507 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2508 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2509 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2510 out4f[x*4+0] = c[0];
2511 out4f[x*4+1] = c[1];
2512 out4f[x*4+2] = c[2];
2513 out4f[x*4+3] = c[3];
// nearest, clamp-to-edge addressing
2517 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2519 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2521 tci[0] = subtc[0]>>12;
2522 tci[1] = subtc[1]>>12;
2523 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2524 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2525 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2526 c[0] = pixel[0][2] * (1.0f / 255.0f);
2527 c[1] = pixel[0][1] * (1.0f / 255.0f);
2528 c[2] = pixel[0][0] * (1.0f / 255.0f);
2529 c[3] = pixel[0][3] * (1.0f / 255.0f);
2530 out4f[x*4+0] = c[0];
2531 out4f[x*4+1] = c[1];
2532 out4f[x*4+2] = c[2];
2533 out4f[x*4+3] = c[3];
// nearest, wrapping addressing
2538 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2540 tci[0] = subtc[0]>>12;
2541 tci[1] = subtc[1]>>12;
2542 tci[0] &= tciwrapmask[0];
2543 tci[1] &= tciwrapmask[1];
2544 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2545 c[0] = pixel[0][2] * (1.0f / 255.0f);
2546 c[1] = pixel[0][1] * (1.0f / 255.0f);
2547 c[2] = pixel[0][0] * (1.0f / 255.0f);
2548 c[3] = pixel[0][3] * (1.0f / 255.0f);
2549 out4f[x*4+0] = c[0];
2550 out4f[x*4+1] = c[1];
2551 out4f[x*4+2] = c[2];
2552 out4f[x*4+3] = c[3];
2559 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2563 int startx = span->startx;
2564 int endx = span->endx;
2566 __m128 data, slope, tcscale;
2567 __m128i tcsize, tcmask, tcoffset, tcmax;
2569 __m128i subtc, substep, endsubtc;
2572 int affine; // LordHavoc: optimized affine texturing case
2573 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2574 const unsigned char * RESTRICT pixelbase;
2575 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2576 // if no texture is bound, just fill it with white
2579 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2582 mip = triangle->mip[texunitindex];
2583 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2584 // if this mipmap of the texture is 1 pixel, just fill it with that color
2585 if (texture->mipmap[mip][1] == 4)
2587 unsigned int k = *((const unsigned int *)pixelbase);
2588 for (x = startx;x < endx;x++)
2592 affine = zf[startx] == zf[endx-1];
2593 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2594 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2595 flags = texture->flags;
2596 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2597 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2598 tcscale = _mm_cvtepi32_ps(tcsize);
2599 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2600 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2601 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2603 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2604 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2605 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2606 tcmax = _mm_packs_epi32(tcmask, tcmask);
2607 for (x = startx;x < endx;)
2609 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2610 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2611 if (nextsub >= endx || affine)
2613 nextsub = endsub = endx-1;
2614 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2618 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2620 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2621 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2622 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2623 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2624 substep = _mm_slli_epi32(substep, 1);
2627 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2628 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2630 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2631 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2633 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2634 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2635 tci = _mm_madd_epi16(tci, tcoffset);
2636 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2637 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2638 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2639 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2640 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2641 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2642 fracm = _mm_srli_epi16(subtc, 1);
2643 pix1 = _mm_add_epi16(pix1,
2644 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2645 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2646 pix3 = _mm_add_epi16(pix3,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2648 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2649 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2650 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2651 pix2 = _mm_add_epi16(pix2,
2652 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2653 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2654 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2658 const unsigned char * RESTRICT ptr1;
2659 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2660 tci = _mm_madd_epi16(tci, tcoffset);
2661 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2662 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2664 fracm = _mm_srli_epi16(subtc, 1);
2665 pix1 = _mm_add_epi16(pix1,
2666 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2667 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2668 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2669 pix1 = _mm_add_epi16(pix1,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2672 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2676 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2678 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2680 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2681 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2682 tci = _mm_madd_epi16(tci, tcoffset);
2683 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2684 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2685 _mm_setzero_si128());
2686 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2687 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2688 _mm_setzero_si128());
2689 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2690 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2691 tci = _mm_madd_epi16(tci, tcoffset);
2692 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2693 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2694 _mm_setzero_si128());
2695 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2696 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2697 _mm_setzero_si128());
2698 fracm = _mm_srli_epi16(subtc, 1);
2699 pix1 = _mm_add_epi16(pix1,
2700 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2701 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2702 pix3 = _mm_add_epi16(pix3,
2703 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2704 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2705 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2706 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2707 pix2 = _mm_add_epi16(pix2,
2708 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2709 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2710 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2714 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2715 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2716 tci = _mm_madd_epi16(tci, tcoffset);
2717 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2718 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2719 _mm_setzero_si128());
2720 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2721 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2722 _mm_setzero_si128());
2723 fracm = _mm_srli_epi16(subtc, 1);
2724 pix1 = _mm_add_epi16(pix1,
2725 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2727 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2728 pix1 = _mm_add_epi16(pix1,
2729 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2730 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2731 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2737 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2739 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2740 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2741 tci = _mm_madd_epi16(tci, tcoffset);
2742 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2743 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2744 _mm_setzero_si128());
2745 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2746 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2747 _mm_setzero_si128());
2748 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2749 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2750 tci = _mm_madd_epi16(tci, tcoffset);
2751 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2752 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2753 _mm_setzero_si128());
2754 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2755 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2756 _mm_setzero_si128());
2757 fracm = _mm_srli_epi16(subtc, 1);
2758 pix1 = _mm_add_epi16(pix1,
2759 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2760 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2761 pix3 = _mm_add_epi16(pix3,
2762 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2763 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2764 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2765 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2766 pix2 = _mm_add_epi16(pix2,
2767 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2768 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2769 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2773 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2774 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2775 tci = _mm_madd_epi16(tci, tcoffset);
2776 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2777 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2778 _mm_setzero_si128());
2779 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2780 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2781 _mm_setzero_si128());
2782 fracm = _mm_srli_epi16(subtc, 1);
2783 pix1 = _mm_add_epi16(pix1,
2784 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2786 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2787 pix1 = _mm_add_epi16(pix1,
2788 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2789 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2790 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2797 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2799 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2801 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2802 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2803 tci = _mm_madd_epi16(tci, tcoffset);
2804 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2805 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2809 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2810 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2811 tci = _mm_madd_epi16(tci, tcoffset);
2812 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2818 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2820 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2821 tci = _mm_and_si128(tci, tcmax);
2822 tci = _mm_madd_epi16(tci, tcoffset);
2823 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2824 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2828 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2829 tci = _mm_and_si128(tci, tcmax);
2830 tci = _mm_madd_epi16(tci, tcoffset);
2831 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D BGRA8 sampler. The only visible work is a
// fill of the span's output pixels [startx, endx) with 0xFF bytes, i.e.
// opaque white; triangle, texunitindex, arrayindex and zf are not used by it.
2840 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count must be (endx - startx)*4: the reversed difference is
// negative and would turn into an enormous size_t when passed to memset
2843 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// NOTE(review): only the signature is visible here. Going by the name, this
// presumably samples a shadow map for the direction/position given in vector
// and returns a float factor - confirm against the body before relying on it.
2846 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies an input color buffer by an interpolated vertex attribute.
// For every x in [startx, endx) and each of the four channels:
//   out4f[x*4+i] = in4f[x*4+i] * (data[i] + slope[i]*x) * z
//   triangle, span - feed the attribute setup macro; span also gives the range
//   out4f, in4f    - 4 floats per pixel, indexed by span-relative x
//   arrayindex     - vertex attribute to interpolate
//   zf             - NOTE(review): z is presumably loaded from zf[x] for each
//                    pixel; that assignment is not visible here - confirm
2853 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2856 int startx = span->startx;
2857 int endx = span->endx;
// presumably fills data/slope with the attribute's base value and per-x
// slope for this span - confirm against the macro definition
2862 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2863 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2866 c[0] = (data[0] + slope[0]*x) * z;
2867 c[1] = (data[1] + slope[1]*x) * z;
2868 c[2] = (data[2] + slope[2]*x) * z;
2869 c[3] = (data[3] + slope[3]*x) * z;
2870 out4f[x*4+0] = in4f[x*4+0] * c[0];
2871 out4f[x*4+1] = in4f[x*4+1] * c[1];
2872 out4f[x*4+2] = in4f[x*4+2] * c[2];
2873 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to a float color buffer.
// For every x in [startx, endx) and each of the four channels:
//   out4f[x*4+i] = (data[i] + slope[i]*x) * z
//   triangle, span - feed the attribute setup macro; span also gives the range
//   out4f          - output, 4 floats per pixel, indexed by span-relative x
//   arrayindex     - vertex attribute to interpolate
//   zf             - NOTE(review): z is presumably loaded from zf[x] for each
//                    pixel; that assignment is not visible here - confirm
2879 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2882 int startx = span->startx;
2883 int endx = span->endx;
// presumably fills data/slope with the attribute's base value and per-x
// slope for this span - confirm against the macro definition
2888 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2889 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2892 c[0] = (data[0] + slope[0]*x) * z;
2893 c[1] = (data[1] + slope[1]*x) * z;
2894 c[2] = (data[2] + slope[2]*x) * z;
2895 c[3] = (data[3] + slope[3]*x) * z;
2896 out4f[x*4+0] = c[0];
2897 out4f[x*4+1] = c[1];
2898 out4f[x*4+2] = c[2];
2899 out4f[x*4+3] = c[3];
2905 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2907 int x, startx = span->startx, endx = span->endx;
2908 float c[4], localcolor[4];
2909 localcolor[0] = subcolor[0];
2910 localcolor[1] = subcolor[1];
2911 localcolor[2] = subcolor[2];
2912 localcolor[3] = subcolor[3];
2913 for (x = startx;x < endx;x++)
2915 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2916 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2917 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2918 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2919 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2920 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2921 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2922 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2928 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2930 int x, startx = span->startx, endx = span->endx;
2931 for (x = startx;x < endx;x++)
2933 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2942 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2944 int x, startx = span->startx, endx = span->endx;
2945 for (x = startx;x < endx;x++)
2947 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2948 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2949 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2950 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans per pixel: out4f = ina4f * a + inb4f * b, per channel,
// for pixels span->startx <= x < span->endx (4 floats per pixel).
// a is one minus the fourth channel of the inb4f pixel.
// NOTE(review): the declaration of a and b and the statement assigning b are not
// visible in this listing; presumably b is the inb4f fourth channel — confirm.
2956 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2958 int x, startx = span->startx, endx = span->endx;
2960 for (x = startx;x < endx;x++)
// weight applied to the first input
2962 a = 1.0f - inb4f[x*4+3];
2964 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2965 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2966 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2967 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2973 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2975 int x, startx = span->startx, endx = span->endx;
2976 float localcolor[4], ilerp, lerp;
2977 localcolor[0] = color[0];
2978 localcolor[1] = color[1];
2979 localcolor[2] = color[2];
2980 localcolor[3] = color[3];
2981 ilerp = 1.0f - localcolor[3];
2982 lerp = localcolor[3];
2983 for (x = startx;x < endx;x++)
2985 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2986 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2987 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2988 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a BGRA8 span by an interpolated vertex attribute (SSE2 fixed point).
// The attribute is evaluated exactly only at sub-span end points as
// (data + slope*x) * zf[x], scaled by 256, and stepped linearly in between.
// Pixels are processed two at a time, with a single-pixel tail.
// NOTE(review): several short lines are not visible in this listing: the
// declarations of x, data, slope, mod and endmod, the statements that carry
// endmod/endsubmod into mod/submod at the top of each sub-span, and the guard
// around the single-pixel tail — verify against the full source.
2995 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2999 int startx = span->startx;
3000 int endx = span->endx;
3003 __m128i submod, substep, endsubmod;
3004 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the BGRA byte order
3005 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3006 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 8.8 fixed point
3007 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3008 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3009 for (x = startx; x < endx;)
// pick the end of this sub-span; subscale converts the float delta into a fixed-point per-pixel step
3011 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3012 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3013 if (nextsub >= endx)
// final sub-span: clamp to the last pixel and rescale the step for the shorter length
3015 nextsub = endsub = endx-1;
3016 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the sub-span end
3020 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3021 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3022 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
3023 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3024 substep = _mm_packs_epi32(substep, substep);
3025 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: bytes are placed in the high byte of each 16-bit lane (value*256),
// so mulhi yields byte*modulator/256; packus saturates back to bytes
3027 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3028 pix = _mm_mulhi_epu16(pix, submod);
3029 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3033 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3034 pix = _mm_mulhi_epu16(pix, submod);
3035 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute into a BGRA8 span (SSE2 fixed point).
// The attribute is evaluated exactly only at sub-span end points as
// (data + slope*x) * zf[x], scaled by 4095, and stepped linearly in between;
// each 16-bit value is shifted right by 4 and saturated to a byte on output.
// Pixels are processed two at a time, with a single-pixel tail.
// NOTE(review): several short lines are not visible in this listing: the
// declarations of x, data, slope, mod and endmod, the statements that carry
// endmod/endsubmod into mod/submod at the top of each sub-span, and the guard
// around the single-pixel tail — verify against the full source.
3042 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3046 int startx = span->startx;
3047 int endx = span->endx;
3050 __m128i submod, substep, endsubmod;
3051 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the BGRA byte order
3052 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3053 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and scaled by 4095 to integer
3054 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3055 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3056 for (x = startx; x < endx;)
// pick the end of this sub-span; subscale converts the float delta into an integer per-pixel step
3058 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3059 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3060 if (nextsub >= endx)
// final sub-span: clamp to the last pixel and rescale the step for the shorter length
3062 nextsub = endsub = endx-1;
3063 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the sub-span end
3067 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3068 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3069 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
3070 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3071 substep = _mm_packs_epi32(substep, substep);
3072 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: drop the 4 fractional bits, saturate to bytes, store 8 bytes
3074 __m128i pix = _mm_srai_epi16(submod, 4);
3075 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3079 __m128i pix = _mm_srai_epi16(submod, 4);
3080 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: out = saturate(ina + max(inb - subcolor*255, 0)) per channel,
// for pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3087 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3090 int x, startx = span->startx, endx = span->endx;
// scale the subtract color by 255, swap components 0 and 2 to match the byte order,
// then duplicate it into both 64-bit halves as 16-bit lanes
3091 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3092 localcolor = _mm_packs_epi32(localcolor, localcolor);
3093 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input to 16-bit lanes
3095 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3096 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero
3097 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3098 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3102 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3105 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 channel-wise product: out = (ina * inb) / 256 per byte,
// for pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3110 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3113 int x, startx = span->startx, endx = span->endx;
3114 for (x = startx;x+2 <= endx;x+=2)
// ina bytes go to the low byte of each 16-bit lane, inb bytes to the high byte (inb*256),
// so the high 16 bits of the product are ina*inb/256
3116 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3118 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3123 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3125 pix1 = _mm_mulhi_epu16(pix1, pix2);
3126 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 channel-wise saturating sum: out = min(ina + inb, 255) per byte,
// for pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3131 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3134 int x, startx = span->startx, endx = span->endx;
3135 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes, add, and let packus clamp the result to a byte
3137 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3139 pix1 = _mm_add_epi16(pix1, pix2);
3140 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3144 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3146 pix1 = _mm_add_epi16(pix1, pix2);
3147 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 tinted add: out = saturate(ina + inb * tint) per byte, where tint is
// inbtintbgra scaled by 256 to fixed point (no component swap is applied to it).
// Covers pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3153 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3156 int x, startx = span->startx, endx = span->endx;
// convert the tint to 16-bit fixed point and duplicate it for two pixels
3157 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3158 tint = _mm_packs_epi32(tint, tint);
3159 for (x = startx;x+2 <= endx;x+=2)
// inb bytes sit in the high byte of each lane (inb*256), so mulhi gives inb*tint/256
3161 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3162 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3163 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3164 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3168 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3169 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3170 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3171 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend of two spans using the fourth byte of each inb pixel as the factor:
// out = ina + (inb - ina) * inb_alpha / 256 per channel,
// for pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3177 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3180 int x, startx = span->startx, endx = span->endx;
3181 for (x = startx;x+2 <= endx;x+=2)
3183 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3184 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth 16-bit lane across that pixel's four lanes
3185 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4 so the signed high-half multiply yields diff*blend/256
3186 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3187 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3191 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3192 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3193 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3194 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3195 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend toward a constant color, using the color's fourth component as factor:
// out = in + (color*255 - in) * (color[3]*255) / 256 per channel,
// for pixels span->startx <= x < span->endx, two pixels per iteration.
// NOTE(review): the guard around the trailing single-pixel block is not visible
// in this listing; presumably it runs only when one pixel remains — confirm.
3200 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3203 int x, startx = span->startx, endx = span->endx;
// scale the color by 255, swap components 0 and 2 to match the byte order,
// then duplicate it into both 64-bit halves as 16-bit lanes
3204 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3205 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the fourth lane of each half and pre-shift it by 4 for the high-half multiply
3206 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3207 for (x = startx;x+2 <= endx;x+=2)
3209 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (color - pix) shifted left by 4, times blend (already shifted by 4), high half = diff*blend/256
3210 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3211 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3215 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3216 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3217 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs TransformProject on the POSITION array
// with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, then
// loads the COLOR and TEXCOORD0 arrays; TEXCOORD1 is loaded only when the
// SPECULAR permutation bit is set.
3224 static void DPSOFTRAST_VertexShader_Generic(void)
3226 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3228 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3229 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3230 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span, using on-stack BGRA8 buffers.
// With the DIFFUSE permutation: samples the first texture unit, multiplies it by
// interpolated attribute 1, and (with SPECULAR) samples the second texture unit
// and combines it by multiply, add or alpha mix depending on further permutation bits.
// The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces and any bare `else` lines are not visible in this listing, so
// the exact nesting (in particular how the VaryingBGRA8 call relates to the DIFFUSE
// test) must be confirmed against the full source.
3233 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3235 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture (texcoord attribute 2) modulated by interpolated attribute 1
3242 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3243 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3244 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// second texture, sampled with the same texcoord attribute index (2)
3246 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3247 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3250 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests the same COLORMAPPING bit as the `if` above, so the
// AddBuffers branch appears unreachable; a different permutation bit was
// presumably intended — confirm against the shader this emulates.
3252 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3255 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3257 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3260 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// interpolated attribute 1 written directly as the fragment color
3265 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3266 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs TransformProject on the POSITION
// array with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1,
// then calls Array_Load for (TEXCOORD0, TEXCOORD0) and (TEXCOORD1, TEXCOORD4).
// NOTE(review): the second load pairs two different array slots; the argument
// order of DPSOFTRAST_Array_Load is not visible here — confirm which is the destination.
3271 static void DPSOFTRAST_VertexShader_PostProcess(void)
3273 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3274 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3275 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first texture
// unit into the fragment buffer, optionally adds thresholded bloom from the second
// texture unit (BLOOM permutation), blends toward the ViewTintColor uniform, and
// finishes the span. The SATURATION and GAMMARAMPS permutations are tested but
// have no implementation yet.
3278 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3281 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3282 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3285 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3286 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture unit is sampled with texcoord attribute 3 and added after
// subtracting the BloomColorSubtract uniform
3288 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3289 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// in-place blend toward the view tint color (its fourth component is the blend factor)
3291 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3292 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3294 // TODO: implement saturation
3296 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3298 // TODO: implement gammaramps
3300 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only runs TransformProject on the
// POSITION array with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
3305 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3307 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the span's pixels
// (4 bytes each, startx up to but excluding endx) in an on-stack buffer and
// passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3310 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3312 // this is never called (because colormask is off when this shader is used)
3313 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3314 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3315 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3316 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3317 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: runs TransformProject on the POSITION
// array with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1
// and Transform on TEXCOORD0 with the uniform block at DPSOFTRAST_UNIFORM_TexMatrixM1.
3322 static void DPSOFTRAST_VertexShader_FlatColor(void)
3324 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3325 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: samples the color texture and
// multiplies each pixel by a constant built from the Color_Ambient uniform (first
// three components, scaled by 256) and the Alpha uniform (scaled by 255).
// Output goes straight to the framebuffer row unless ALPHAKILL is set or the blend
// mode is not opaque, in which case it goes to a local buffer that is then passed
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the loop's local declarations (color, pix, pix2) and any control
// statements between the 4-pixel path and the 1-pixel path (advancing x past the
// extra pixels, per-pixel mask tests) are not visible in this listing — confirm.
3328 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3331 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3332 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3333 int x, startx = span->startx, endx = span->endx;
3334 __m128i Color_Ambientm;
3335 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3336 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3339 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha-kill or blending requires the finish pass, so render into the local buffer instead
3340 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3341 pixel = buffer_FragColorbgra8;
// build the 16-bit modulator: ambient color * 256 with components 0 and 2 swapped,
// fourth lane replaced by Alpha * 255, duplicated for two pixels
3342 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3343 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3344 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3345 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3346 for (x = startx;x < endx;x++)
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3349 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3352 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each lane, so mulhi gives texel*modulator/256
3353 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3354 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3355 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// 1-pixel path
3361 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3362 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3363 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case needs the finish pass
3365 if (pixel == buffer_FragColorbgra8)
3366 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: runs TransformProject on the POSITION
// array with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1,
// loads the COLOR array, and runs Transform on TEXCOORD0 with the uniform block
// at DPSOFTRAST_UNIFORM_TexMatrixM1.
3372 static void DPSOFTRAST_VertexShader_VertexColor(void)
3374 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3375 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3376 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span. Per pixel:
//   out = texel * (Color_Diffuse * interpolated_vertex_color + Color_Ambient)
// in 16-bit fixed point, with the fourth lane of the ambient term taken from the
// Alpha uniform and the fourth lane of the diffuse term zeroed.
// Output goes straight to the framebuffer row unless ALPHAKILL is set or the blend
// mode is not opaque, in which case it goes to a local buffer that is then passed
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of data, slope, mod2 and pix2, and any control
// statements between the 4-pixel path and the 1-pixel path (advancing x past the
// extra pixels, per-pixel mask tests) are not visible in this listing — confirm.
3379 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3382 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3383 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3384 int x, startx = span->startx, endx = span->endx;
3385 __m128i Color_Ambientm, Color_Diffusem;
3387 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3388 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3391 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3392 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha-kill or blending requires the finish pass, so render into the local buffer instead
3393 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3394 pixel = buffer_FragColorbgra8;
// ambient term: color * 256 with components 0 and 2 swapped, fourth lane = Alpha * 255
3395 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3396 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3397 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3398 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse term: color * 4096 with components 0 and 2 swapped, fourth lane zeroed
3399 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3400 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3401 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: swap components 0 and 2, advance to startx, scale by 4096;
// from here on data is stepped incrementally by slope once per pixel
3402 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3403 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3404 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3405 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3406 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3407 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3408 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3410 __m128i color, mod, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3411 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3414 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3415 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// each of the four pixels multiplies the stepped attribute by its own z lane;
// data is advanced three times here (the loop increment supplies another step)
3416 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3417 data = _mm_add_ps(data, slope);
3418 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3419 data = _mm_add_ps(data, slope);
3420 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3421 data = _mm_add_ps(data, slope);
3422 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, using high-half multiplies
3423 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3424 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3425 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3426 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3427 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// 1-pixel path
3433 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3434 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3435 mod = _mm_packs_epi32(mod, mod);
3436 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3437 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case needs the finish pass
3439 if (pixel == buffer_FragColorbgra8)
3440 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: runs TransformProject on the POSITION array
// with the uniform block at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, runs
// Transform on TEXCOORD0 with the uniform block at DPSOFTRAST_UNIFORM_TexMatrixM1,
// and loads the TEXCOORD4 array.
3446 static void DPSOFTRAST_VertexShader_Lightmap(void)
3448 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3449 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3450 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span. Per pixel, in 16-bit fixed point:
//   out = texel * (Color_Diffuse * lightmap + Color_Ambient) [+ Color_Glow * glowtexel]
// The glow term is added only with the GLOW permutation, which has its own loop.
// The color and glow textures are sampled with TEXCOORD0, the lightmap with TEXCOORD4.
// Output goes straight to the framebuffer row unless ALPHAKILL is set or the blend
// mode is not opaque, in which case it goes to a local buffer that is then passed
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the `else` separating the glow and non-glow loops, the pix2
// declarations, and any control statements between the 4-pixel and 1-pixel paths
// (advancing x past the extra pixels, per-pixel mask tests) are not visible in this
// listing — confirm against the full source.
3453 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's row in color buffer 0, 4 bytes per pixel
3457 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3458 int x, startx = span->startx, endx = span->endx;
3459 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3460 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3461 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3462 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3465 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3466 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3467 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha-kill or blending requires the finish pass, so render into the local buffer instead
3468 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3469 pixel = buffer_FragColorbgra8;
// ambient term: color * 256 with components 0 and 2 swapped, fourth lane = Alpha * 255
3470 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3471 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3473 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse term: color * 256 with components 0 and 2 swapped, fourth lane zeroed
3474 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3475 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3476 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3477 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and build its color modulator the same way
3479 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3480 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3481 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow color; used by the 1-pixel path below
3483 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3484 for (x = startx;x < endx;x++)
3486 __m128i color, lightmap, glow, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3487 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3490 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3491 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3492 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel + glowcolor * glowtexel, two pixels per register
3493 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3494 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3495 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3496 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3497 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3498 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3499 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// 1-pixel path: the lit term is computed in the low half and the glow term in the
// high half of one register, then the high half is folded onto the low half
3505 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3506 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3507 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3508 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3509 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3510 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3515 for (x = startx;x < endx;x++)
3517 __m128i color, lightmap, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3518 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3521 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3522 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3523 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3524 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3525 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3526 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3527 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// 1-pixel path
3533 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3534 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3535 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3536 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case needs the finish pass
3539 if (pixel == buffer_FragColorbgra8)
3540 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap_* wrappers that follow
// call these two functions before their definitions appear later in this file.
3545 void DPSOFTRAST_VertexShader_LightDirection(void);
3546 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex stage: a thin wrapper that forwards to
// DPSOFTRAST_VertexShader_LightDirection with no extra work of its own.
3548 static void DPSOFTRAST_VertexShader_FakeLight(void)
3550 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel stage: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which selects the fake-light behaviour
// itself by testing thread->shader_mode against SHADERMODE_FAKELIGHT.
3553 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3555 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space deluxemap vertex stage: runs the shared LightDirection vertex
// shader, then additionally loads TEXCOORD4, the coordinate set the pixel
// shader uses to sample GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3560 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3562 DPSOFTRAST_VertexShader_LightDirection();
3563 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space deluxemap pixel stage: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which handles this mode by testing
// thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3566 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3568 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space deluxemap vertex stage: runs the shared LightDirection vertex
// shader, then additionally loads TEXCOORD4, the coordinate set the pixel
// shader uses to sample GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3573 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3575 DPSOFTRAST_VertexShader_LightDirection();
3576 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space deluxemap pixel stage: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which handles this mode by testing
// thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3579 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3581 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage shared by the LightDirection, FakeLight and LightDirectionMap_*
// shaders. For every vertex it expresses the LightDir uniform and the
// vertex-to-eye vector (EyePosition - position) in the per-vertex basis read
// from TEXCOORD1/2/3 (svector, tvector, normal), and stores the results in
// TEXCOORD5 (light vector) and TEXCOORD6 (eye vector) with w = 0.
// Reads and writes the global dpsoftrast state; takes no arguments and
// returns nothing.
3586 void DPSOFTRAST_VertexShader_LightDirection(void)
3589 int numvertices = dpsoftrast.numvertices;
3591 float LightVector[4];
3592 float EyePosition[4];
3593 float EyeVectorModelSpace[4];
// Fetch the light direction and eye position uniforms. All four components
// are copied, but only x/y/z are used by the per-vertex math below.
3599 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3600 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3601 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3602 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3603 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3604 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3605 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3606 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the input arrays (presumably this fills post_array4f, which the
// loop below reads -- confirm against DPSOFTRAST_Array_Load). TEXCOORD0 goes
// through DPSOFTRAST_Array_Transform with the TexMatrixM1 uniform instead of
// a plain load.
3607 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3608 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3609 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3610 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3611 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3612 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its three basis vectors
// (TEXCOORD1 -> svector, TEXCOORD2 -> tvector, TEXCOORD3 -> normal).
3614 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3615 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3616 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3617 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3618 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3619 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3620 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3621 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3622 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3623 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3624 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3625 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)),
// written to TEXCOORD5 with w forced to 0.
3626 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3627 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3628 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3629 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3632 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// EyeVector: the same three dot products applied to (EyePosition - position),
// written to TEXCOORD6 with w forced to 0. The vector is not normalized here.
3633 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3634 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3635 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3636 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3637 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3638 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3639 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3640 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3641 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3642 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done last, after the positions were read above: run POSITION through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3644 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar and 3-component vector helper macros. Vector3Dot and Vector3Normalize
// are used by DPSOFTRAST_PixelShader_LightDirection.
// Min/Max are plain function-like macros: the selected argument is evaluated
// twice, so do not pass expressions with side effects.
3647 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3648 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product over elements [0], [1] and [2] of two indexable operands.
3649 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length: the dot product of v with itself.
3650 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length: sqrt of the squared length.
3651 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Normalize: a multi-line (continued) macro that starts by computing the
// length of v into a local float named len.
3652 #define DPSOFTRAST_Vector3Normalize(v)\
3655 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage shared by the LightDirection, FakeLight and LightDirectionMap_*
// shader modes. It samples the textures the current permutation/mode needs for
// the span, shades every pixel from span->startx up to (not including)
// span->endx in floating point, stores the result as bytes in
// buffer_FragColorbgra8 and hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; uniform4f, shader_mode, shader_permutation and
//            shader_exactspecularmath are read here.
// triangle - passed through to the span helpers.
// span     - supplies startx/endx and is passed through to the span helpers.
//
// One of three paths is taken, keyed by thread->shader_permutation:
//   SPECULAR  : ambient + (diffuse + specular) * light colour
//   DIFFUSE   : ambient + diffuse * light colour
//   otherwise : ambient only
// Each path adds the glow texture when SHADERPERMUTATION_GLOW is set.
// NOTE(review): the pants/shirt textures are sampled whenever COLORMAPPING is
// set, but they are only blended into diffusetex in the SPECULAR path --
// confirm that is intended.
3666 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3668 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3669 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678 int x, startx = span->startx, endx = span->endx;
3679 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3680 float LightVectordata[4];
3681 float LightVectorslope[4];
3682 float EyeVectordata[4];
3683 float EyeVectorslope[4];
3684 float VectorSdata[4];
3685 float VectorSslope[4];
3686 float VectorTdata[4];
3687 float VectorTslope[4];
3688 float VectorRdata[4];
3689 float VectorRslope[4];
3691 float diffusetex[4];
3693 float surfacenormal[4];
3694 float lightnormal[4];
3695 float lightnormal_modelspace[4];
3697 float specularnormal[4];
3700 float SpecularPower;
// Colour uniforms: components 0 and 2 are swapped on load so that the local
// arrays index in the same order as the bgra8 texel buffers they multiply.
3702 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3703 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3704 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3705 Color_Glow[3] = 0.0f;
3706 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3707 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3708 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component is taken from the Alpha uniform; it is the only
// factor applied to the output alpha channel in every path below.
3709 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3710 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3711 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3712 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3713 Color_Pants[3] = 0.0f;
3714 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3715 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3716 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3717 Color_Shirt[3] = 0.0f;
// Set up the span (fills buffer_z, which every texture fetch below receives)
// and sample the textures needed regardless of lighting path. All fetches in
// this function use TEXCOORD0 except lightmap/deluxemap, which use TEXCOORD4.
3718 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3719 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3720 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3722 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3725 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3727 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular + diffuse + ambient ----
3729 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3731 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3732 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3733 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3734 Color_Diffuse[3] = 0.0f;
3735 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3736 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3737 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3738 LightColor[3] = 0.0f;
3739 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3740 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3741 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3742 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3743 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the gloss alpha, which
// is a raw byte value read from buffer_texture_glossbgra8.
3744 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every mode in this path.
3745 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3746 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: the model-space deluxemap mode needs the S/T/R
// vectors (TEXCOORD1/2/3) plus lightmap and deluxemap; the tangent-space mode
// needs only the two textures; fake light needs nothing more; the remaining
// mode needs the light vector from TEXCOORD5.
3748 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3750 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3751 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3752 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3753 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3754 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3756 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3758 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3759 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3761 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3763 // nothing of this needed
3767 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop for the specular path.
3770 for (x = startx;x < endx;x++)
// Texel values are kept in raw byte units (0..255) as floats.
3773 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3774 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3775 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3776 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3777 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the tinted pants and shirt layers; Color_Pants[3] and Color_Shirt[3]
// are 0, so the alpha line adds nothing.
3779 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3780 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3781 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3782 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3784 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3785 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3786 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3787 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 maps 0..255 to -1..+0.992. The texel
// byte order is reversed on read (byte 2 -> [0], byte 0 -> [2]).
3788 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3789 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3790 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3791 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the light direction, then take its dot
// product with the S/T/R vectors evaluated at this x (data + slope * x).
3793 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3795 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3796 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3797 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3798 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3800 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3801 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3802 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3803 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3805 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3806 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3807 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3808 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3810 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3811 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3812 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3813 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3815 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3816 DPSOFTRAST_Vector3Normalize(lightnormal);
3818 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Light colour = lightmap byte / 256, divided by the light normal's z
// component; z is clamped to at least 0.25, so the boost is at most 4x.
3820 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3821 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3822 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3823 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded texel is used directly as the light
// normal (it is not normalized here) and the lightmap is scaled by 1/256.
3826 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3828 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3829 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3830 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3832 float f = 1.0f / 256.0f;
3833 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3834 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3835 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized eye vector and the light
// colour is white. (z is presumably the per-pixel value from buffer_z --
// TODO confirm.)
3838 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3840 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3841 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3842 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3843 DPSOFTRAST_Vector3Normalize(lightnormal);
3845 LightColor[0] = 1.0;
3846 LightColor[1] = 1.0;
3847 LightColor[2] = 1.0;
// Remaining mode: the light direction is the interpolated TEXCOORD5 light
// vector; LightColor keeps the uniform value loaded before the loop.
3851 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3852 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3853 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3854 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(N, L) clamped at zero.
3857 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, exact variant: reflect the light direction about the surface
// normal and take its dot product with the normalized eye vector.
3859 if(thread->shader_exactspecularmath)
3861 // reflect lightnormal at surfacenormal, take the negative of that
3862 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3864 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3865 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3866 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3867 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3869 // dot of this and normalize(EyeVectorFogDepth.xyz)
3870 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3871 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3872 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3873 DPSOFTRAST_Vector3Normalize(eyenormal);
3875 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Specular term, cheaper variant: normalize (L + E) into a half vector and
// take its dot product with the surface normal.
3879 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3880 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3881 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3882 DPSOFTRAST_Vector3Normalize(eyenormal);
3884 specularnormal[0] = lightnormal[0] + eyenormal[0];
3885 specularnormal[1] = lightnormal[1] + eyenormal[1];
3886 specularnormal[2] = lightnormal[2] + eyenormal[2];
3887 DPSOFTRAST_Vector3Normalize(specularnormal);
3889 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = 1 + SpecularPower * gloss alpha (gloss alpha is a raw byte value).
3891 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine the terms per channel. Only the upper bound (255) is clamped.
// Output alpha is colour-texture alpha * Color_Ambient[3], i.e. the Alpha uniform.
3893 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3895 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3896 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3897 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3898 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3902 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3903 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3904 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3905 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3908 buffer_FragColorbgra8[x*4+0] = d[0];
3909 buffer_FragColorbgra8[x*4+1] = d[1];
3910 buffer_FragColorbgra8[x*4+2] = d[2];
3911 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse + ambient (no gloss texture, no specular term) ----
3914 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3916 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3917 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3918 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3919 Color_Diffuse[3] = 0.0f;
3920 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3921 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3922 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3923 LightColor[3] = 0.0f;
3924 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs, as in the specular path, except that the eye vector
// (TEXCOORD6) is only fetched for fake light, where it serves as the light
// direction.
3926 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3928 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3929 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3930 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3931 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3932 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3934 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3936 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3937 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3939 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3941 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3945 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop for the diffuse path. The colour texel is used as-is; the
// pants/shirt layers are not added here.
3948 for (x = startx;x < endx;x++)
3951 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the surface normal (byte/128 - 1, byte order reversed).
3955 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3956 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3957 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3958 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Light direction and colour per mode: same computation as in the specular path.
3960 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3962 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3963 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3964 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3965 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3967 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3968 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3969 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3970 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3972 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3973 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3974 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3975 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3977 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3978 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3979 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3980 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3982 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3983 DPSOFTRAST_Vector3Normalize(lightnormal);
3985 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap byte / 256, divided by the light normal's z clamped to >= 0.25.
3987 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3988 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3989 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3990 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3993 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3995 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3996 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3997 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3999 float f = 1.0f / 256.0f;
4000 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4001 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4002 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4005 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4007 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4008 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4009 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4010 DPSOFTRAST_Vector3Normalize(lightnormal);
4012 LightColor[0] = 1.0;
4013 LightColor[1] = 1.0;
4014 LightColor[2] = 1.0;
4018 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4019 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4020 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4021 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then colour = texel * (ambient + diffuse * light),
// plus glow when enabled. Only the upper bound (255) is clamped.
4024 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4025 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4027 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4028 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4029 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4030 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4034 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4035 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4036 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4037 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4039 buffer_FragColorbgra8[x*4+0] = d[0];
4040 buffer_FragColorbgra8[x*4+1] = d[1];
4041 buffer_FragColorbgra8[x*4+2] = d[2];
4042 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (no normal map, no light direction) ----
4047 for (x = startx;x < endx;x++)
4050 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4051 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4052 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4053 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colour = texel * ambient, plus glow when enabled; upper clamp at 255 only.
4055 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4057 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4058 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4059 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4060 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4064 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4065 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4066 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4067 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4069 buffer_FragColorbgra8[x*4+0] = d[0];
4070 buffer_FragColorbgra8[x*4+1] = d[1];
4071 buffer_FragColorbgra8[x*4+2] = d[2];
4072 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span buffer to the common finishing routine.
4075 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4080 static void DPSOFTRAST_VertexShader_LightSource(void)
4083 int numvertices = dpsoftrast.numvertices;
4084 float LightPosition[4];
4085 float LightVector[4];
4086 float LightVectorModelSpace[4];
4087 float EyePosition[4];
4088 float EyeVectorModelSpace[4];
4094 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4095 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4096 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4097 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4098 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4099 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4100 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4101 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4102 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4103 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4104 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4105 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4106 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4107 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4108 for (i = 0;i < numvertices;i++)
4110 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4111 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4112 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4113 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4114 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4115 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4116 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4117 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4118 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4119 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4120 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4121 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4122 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4123 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4124 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4125 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4126 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4127 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4128 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4131 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4132 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4133 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4134 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4135 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4136 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4137 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4138 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4139 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4140 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4141 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4143 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4144 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the light-source mode: shades the pixels of one span and
// passes the resulting BGRA8 pixels to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f[], shader_permutation and
//              shader_exactspecularmath
//   triangle - triangle whose interpolated attributes are evaluated
//   span     - span to shade; pixels startx .. endx-1 are visited
// Three per-pixel loops are visible: one used with SHADERPERMUTATION_SPECULAR
// (ambient + diffuse + specular), one used with SHADERPERMUTATION_DIFFUSE
// (ambient + diffuse), and a last one that applies only the ambient term.
// NOTE(review): the declarations of z, attenuation, diffuse, specular, f, d,
// glosstex and eyenormal, and the statement that assigns z, are not visible in
// this block - confirm them against the complete function.
4147 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4150 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4151 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158 int x, startx = span->startx, endx = span->endx;
4159 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4160 float CubeVectordata[4];
4161 float CubeVectorslope[4];
4162 float LightVectordata[4];
4163 float LightVectorslope[4];
4164 float EyeVectordata[4];
4165 float EyeVectorslope[4];
4167 float diffusetex[4];
4169 float surfacenormal[4];
4170 float lightnormal[4];
4172 float specularnormal[4];
4175 float SpecularPower;
4176 float CubeVector[4];
// Color uniforms are copied with components 0 and 2 swapped; presumably this
// matches the byte order of the *bgra8 buffers they are multiplied with
// (TODO confirm). Element 3 is forced to 0 except for Color_Ambient, whose
// element 3 is taken from the Alpha uniform.
4179 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4180 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4181 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4182 Color_Glow[3] = 0.0f;
4183 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4184 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4185 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4186 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4187 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4188 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4189 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4190 Color_Diffuse[3] = 0.0f;
4191 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4192 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4193 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4194 Color_Specular[3] = 0.0f;
4195 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4196 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4197 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4198 Color_Pants[3] = 0.0f;
4199 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4200 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4201 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4202 Color_Shirt[3] = 0.0f;
4203 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4204 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4205 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4206 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by glosstex[3], which is
// read as a raw byte from the gloss texture buffer
4207 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// per-span base value and per-pixel slope for the three interpolated vectors;
// they are evaluated below as (data + slope*x) * z
4208 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4209 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4210 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4211 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4212 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// texture fetches for the whole span; pants/shirt and cube textures are only
// sampled when the corresponding permutation bit is set
4213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4216 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4217 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4219 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4220 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- specular path: ambient + diffuse + specular ----
4221 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4223 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4224 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4225 for (x = startx;x < endx;x++)
// attenuation falls off as 1 - |CubeVector|^2
4228 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4229 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4230 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4231 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement controlled by this test is not visible here;
// presumably the pixel is skipped and stays cleared - confirm
4232 if (attenuation < 0.01f)
4234 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4236 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4237 if (attenuation < 0.01f)
// diffuse texel as floats (raw byte values), optionally with tinted
// pants/shirt layers added
4241 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4242 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4243 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4244 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4245 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4247 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4248 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4249 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4250 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4252 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4253 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4254 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4255 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// normal map bytes are remapped with byte/128 - 1; buffer bytes 2,1,0 feed
// vector elements 0,1,2
4256 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4257 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4258 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4259 DPSOFTRAST_Vector3Normalize(surfacenormal);
4261 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4262 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4263 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4264 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), with negative values set to 0
4266 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4268 if(thread->shader_exactspecularmath)
4270 // reflect lightnormal at surfacenormal, take the negative of that
4271 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4273 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4274 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4275 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4276 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4278 // dot of this and normalize(EyeVectorFogDepth.xyz)
4279 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4280 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4281 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4282 DPSOFTRAST_Vector3Normalize(eyenormal);
4284 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// half-vector form: normalize(lightnormal + eyenormal) dotted with the
// surface normal; presumably the alternative to the exact-reflection path
// above (the branch keyword is not visible here - confirm)
4288 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4289 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4290 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4291 DPSOFTRAST_Vector3Normalize(eyenormal);
4293 specularnormal[0] = lightnormal[0] + eyenormal[0];
4294 specularnormal[1] = lightnormal[1] + eyenormal[1];
4295 specularnormal[2] = lightnormal[2] + eyenormal[2];
4296 DPSOFTRAST_Vector3Normalize(specularnormal);
4298 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is 1 + SpecularPower * gloss alpha byte
4300 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// final color per channel, truncated to int and limited to at most 255
// (no lower bound is applied here)
4302 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4304 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4305 attenuation *= (1.0f / 255.0f);
4306 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4307 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4308 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4309 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4313 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4314 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4315 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4316 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4318 buffer_FragColorbgra8[x*4+0] = d[0];
4319 buffer_FragColorbgra8[x*4+1] = d[1];
4320 buffer_FragColorbgra8[x*4+2] = d[2];
4321 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- diffuse path: same as above without gloss texture and specular term ----
4324 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4326 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4327 for (x = startx;x < endx;x++)
4330 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4331 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4332 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4333 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4334 if (attenuation < 0.01f)
4336 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4338 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4339 if (attenuation < 0.01f)
4343 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4344 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4345 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4346 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4347 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4349 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4350 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4351 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4352 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4354 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4355 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4356 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4357 DPSOFTRAST_Vector3Normalize(surfacenormal);
4359 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4360 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4361 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4362 DPSOFTRAST_Vector3Normalize(lightnormal);
4364 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4365 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4367 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4368 attenuation *= (1.0f / 255.0f);
4369 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4370 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4371 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4372 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4376 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4377 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4378 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4379 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4381 buffer_FragColorbgra8[x*4+0] = d[0];
4382 buffer_FragColorbgra8[x*4+1] = d[1];
4383 buffer_FragColorbgra8[x*4+2] = d[2];
4384 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- ambient-only path: no normal map lookup, only Color_Ambient is applied ----
4389 for (x = startx;x < endx;x++)
4392 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4393 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4394 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4395 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4396 if (attenuation < 0.01f)
4398 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4400 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4401 if (attenuation < 0.01f)
4405 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4406 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4407 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4408 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4409 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4411 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4412 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4413 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4414 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4416 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4418 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4419 attenuation *= (1.0f / 255.0f);
4420 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4421 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4422 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4423 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4427 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4428 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4429 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4430 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4432 buffer_FragColorbgra8[x*4+0] = d[0];
4433 buffer_FragColorbgra8[x*4+1] = d[1];
4434 buffer_FragColorbgra8[x*4+2] = d[2];
4435 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the framebuffer write stage
4438 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction mode. Issues three array operations on the
// global dpsoftrast state:
//   TEXCOORD4 <- POSITION transformed by the ModelViewProjection matrix
//                (read back by the refraction pixel shader as
//                ModelViewProjectionPosition)
//   TEXCOORD0 <- TEXCOORD0 transformed by the texture matrix
//   POSITION  <- POSITION passed through DPSOFTRAST_Array_TransformProject
//                with the same ModelViewProjection matrix
// NOTE(review): the first array argument appears to be the destination and the
// second the source, judging by how the pixel shader consumes TEXCOORD4 -
// confirm against the helper definitions.
4444 static void DPSOFTRAST_VertexShader_Refraction(void)
4446 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4447 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4448 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction mode: for every pixel of the span, samples
// the texture bound at GL20TU_REFRACTION at a screen coordinate offset by the
// normal map, multiplies the texel by RefractColor and writes the result to
// the span's BGRA8 output.
//   thread   - per-thread state; supplies texbound[] and uniform4f[]
//   triangle - triangle whose interpolated attributes are evaluated
//   span     - span to shade; pixels startx .. endx-1 are visited
// Returns without drawing when no refraction texture is bound.
// NOTE(review): the declarations of iw, v and c are not visible in this block.
4451 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4453 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4455 int x, startx = span->startx, endx = span->endx;
4458 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4459 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4462 float ModelViewProjectionPositiondata[4];
4463 float ModelViewProjectionPositionslope[4];
4466 float ScreenScaleRefractReflect[2];
4467 float ScreenCenterRefractReflect[2];
4468 float DistortScaleRefractReflect[2];
4469 float RefractColor[4];
4471 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4472 if(!texture) return;
4475 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4476 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4479 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
// only the first two components of the *RefractReflect uniforms are used here
4482 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4483 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4484 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4485 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4486 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4487 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// color components 0 and 2 are swapped on load; component 3 is kept in place
4488 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4489 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4490 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4491 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4494 for (x = startx;x < endx;x++)
4496 float SafeScreenTexCoord[2];
4497 float ScreenTexCoord[2];
4504 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4505 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4507 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4508 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4509 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted shader line below says DistortScaleRefractReflect.zw,
// but the code uses elements 0 and 1 of the uniform - confirm which is intended
4511 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4512 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4513 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4514 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4515 DPSOFTRAST_Vector3Normalize(v);
4516 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4517 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4519 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4520 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the color products are stored into unsigned char without a
// clamp; only the alpha value is limited to 255 - confirm RefractColor stays
// within a range that keeps the products representable
4522 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4523 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4524 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4525 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4528 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water mode. For each vertex it computes the vector
// from the vertex position to the EyePosition uniform, expresses it in the
// basis given by the vectors loaded from TEXCOORD1/2/3 (svector, tvector,
// normal) via three dot products, and stores the result in TEXCOORD6 with the
// fourth component set to 0. Afterwards it projects POSITION and fills
// TEXCOORD4 (ModelViewProjection-transformed position) and TEXCOORD0
// (texture-matrix-transformed texcoord).
// NOTE(review): the declarations of i, position, svector, tvector, normal and
// EyeVector are not visible in this block. The -1 passed as second argument to
// DPSOFTRAST_Array_TransformProject presumably selects the already loaded
// POSITION data - confirm against the helper.
4533 static void DPSOFTRAST_VertexShader_Water(void)
4536 int numvertices = dpsoftrast.numvertices;
4537 float EyePosition[4];
4538 float EyeVectorModelSpace[4];
4544 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4545 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4546 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4547 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4548 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4549 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4550 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4551 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4552 for (i = 0;i < numvertices;i++)
4554 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4555 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4556 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4557 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4558 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4559 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4560 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4561 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4562 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4563 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4564 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4565 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vertex-to-eye vector, then its projection onto svector / tvector / normal
4566 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4567 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4568 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4569 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4570 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4571 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4572 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4573 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4574 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4575 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4577 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4578 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4579 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the water mode: for every pixel of the span, samples the
// refraction and reflection textures at normal-map-distorted screen
// coordinates and blends the two tinted texels with a Fresnel weight computed
// from the interpolated eye vector.
//   thread   - per-thread state; supplies texbound[] and uniform4f[]
//   triangle - triangle whose interpolated attributes are evaluated
//   span     - span to shade; pixels startx .. endx-1 are visited
// Returns without drawing when either the refraction or the reflection
// texture is not bound.
// NOTE(review): the declarations of iw, v and Fresnel are not visible in this
// block.
4583 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4585 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4587 int x, startx = span->startx, endx = span->endx;
4590 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4591 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4594 float ModelViewProjectionPositiondata[4];
4595 float ModelViewProjectionPositionslope[4];
4596 float EyeVectordata[4];
4597 float EyeVectorslope[4];
4600 float ScreenScaleRefractReflect[4];
4601 float ScreenCenterRefractReflect[4];
4602 float DistortScaleRefractReflect[4];
4603 float RefractColor[4];
4604 float ReflectColor[4];
4605 float ReflectFactor;
4606 float ReflectOffset;
4608 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4609 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4610 if(!texture_refraction || !texture_reflection) return;
4613 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4614 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4617 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4618 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// all four components are loaded: elements 0,1 are used for the refraction
// lookup and elements 2,3 for the reflection lookup below
4621 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4622 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4623 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4624 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4625 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4626 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4627 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4628 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4629 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4630 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4631 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4632 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// color components 0 and 2 are swapped on load; component 3 is kept in place
4633 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4634 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4635 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4636 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4637 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4638 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4639 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4640 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4641 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4642 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4645 for (x = startx;x < endx;x++)
4647 float SafeScreenTexCoord[4];
4648 float ScreenTexCoord[4];
4651 unsigned char c1[4];
4652 unsigned char c2[4];
4657 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4658 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4660 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4661 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4662 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4663 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4664 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4666 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4667 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4668 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4669 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4670 DPSOFTRAST_Vector3Normalize(v);
4671 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4672 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4673 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4674 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
// v is reused for the eye vector; the power of 2 from the quoted shader line
// is written as Fresnel * Fresnel
4676 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4677 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4678 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4679 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4680 DPSOFTRAST_Vector3Normalize(v);
4681 Fresnel = 1.0f - v[2];
4682 Fresnel = min(1.0f, Fresnel);
4683 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4685 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4686 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4687 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4688 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// NOTE(review): the blended color is stored into unsigned char without a
// clamp, and Fresnel is not limited after ReflectFactor/ReflectOffset are
// applied; only the alpha value is limited to 255 - confirm the uniforms keep
// the blend within a representable range
4690 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4691 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4692 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4693 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4696 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as both array
// arguments, and the uniform4f data that starts at the ModelViewProjectionMatrixM1 slot
// (uniform slots are addressed in units of 4 floats, hence the 4* scaling).
4701 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4703 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the ShowDepth shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// NOTE(review): the colour produced is all zero bytes for every pixel of the span, and
// buffer_z is only handed to DPSOFTRAST_Draw_Span_Begin; it is not read again here.
4706 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4709 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4710 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4711 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4-bytes-per-pixel entries covering [span->startx, span->endx)
4712 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4713 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as both array
// arguments, and the uniform4f data that starts at the ModelViewProjectionMatrixM1 slot.
4718 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4720 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the DeferredGeometry shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// NOTE(review): the visible statements only emit zero-filled BGRA8 pixels for the span;
// no geometry data is computed from the interpolants here.
4723 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4726 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4727 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4728 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4-bytes-per-pixel entries covering [span->startx, span->endx)
4729 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4730 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as both array
// arguments, and the uniform4f data that starts at the ModelViewProjectionMatrixM1 slot.
4735 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4737 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the DeferredLightSource shader mode.
// thread/triangle/span are forwarded unchanged to the span helpers.
// NOTE(review): the visible statements only emit zero-filled BGRA8 pixels for the span;
// no lighting is evaluated here.
4740 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4743 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4744 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4745 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4-bytes-per-pixel entries covering [span->startx, span->endx)
4746 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4747 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record used by DPSOFTRAST_ShaderModeTable: the vertex and span
// callbacks of a mode plus the lists of vertex arrays and texture units that mode uses.
4752 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; invoked with no arguments by DPSOFTRAST_DrawTriangles
4755 void (*Vertex)(void);
// span (pixel) stage; invoked once per queued span by DPSOFTRAST_Draw_ProcessSpans
4756 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array ids used by the mode; code walking this list tests each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 after the last used id
4757 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit ids used by the mode; code walking this list tests each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 after the last used id
4758 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4760 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with SHADERMODE_COUNT rows, indexed by shader mode
// (e.g. DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span).
// Each row is: a leading integer (2 in every row; it initialises a member that precedes
// Vertex — presumably the lodarrayindex read by DPSOFTRAST_Interpret_Draw, confirm against
// the struct), the vertex callback, the span callback, the ~0-terminated list of vertex
// arrays, and the ~0-terminated list of texture units.
// NOTE(review): the last three rows (ShowDepth, DeferredGeometry, DeferredLightSource) give
// no texunits initializer, so their texunits[] is zero-filled (every entry names unit 0)
// instead of starting with ~0 as in the Depth_Or_Shadow row — confirm this is intended.
4762 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4764 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4765 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4766 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4767 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4768 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4769 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4770 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4771 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4772 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4773 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4774 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4775 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4776 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4777 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4778 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4779 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4780 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4781 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel pass mask for one span.
// thread: supplies the depth test state (depthtest, fb_depthfunc), the triangle list and the
//         scratch mask buffer (pixelmaskarray).
// span:   the span to test; on exit span->pixelmask points at the mask and span->startx
//         holds the (possibly adjusted) start column.
// The interpolated depth for column x is depthbase + depthslope*x.
4784 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4789 unsigned int *depthpixel;
4793 unsigned char *pixelmask;
4794 DPSOFTRAST_State_Triangle *triangle;
4795 triangle = &thread->triangles[span->triangle];
// depthpixel is positioned at (span->x, span->y) in the depth buffer, so the x indices used
// below are relative to span->x
4796 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4797 startx = span->startx;
4799 depth = span->depthbase;
4800 depthslope = span->depthslope;
4801 pixelmask = thread->pixelmaskarray;
4802 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// one specialised loop per depth function; each comparison is written as
// stored depth (depthpixel[x]) versus interpolated depth (d)
4804 switch(thread->fb_depthfunc)
4807 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4808 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4809 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4810 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4811 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4812 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4813 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// the two loops below test the first and the last mask entry of the span;
// presumably they trim rejected pixels off each end — confirm against the loop bodies
4815 while (startx < endx && !pixelmask[startx])
4817 while (endx > startx && !pixelmask[endx-1])
4822 // no depth testing means we're just dealing with color...
4823 memset(pixelmask + startx, 1, endx - startx);
4825 span->pixelmask = pixelmask;
4826 span->startx = startx;
// Depth-buffer write pass for one span.
// Only active when thread->depthmask and thread->depthtest are set and a depth buffer
// (dpsoftrast.fb_depthpixels) is bound.
// thread: read-only draw state.
// span:   read-only span; supplies the mask, the start column and the depth interpolants.
4830 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4832 int x, d, depth, depthslope, startx, endx;
4833 const unsigned char *pixelmask;
4834 unsigned int *depthpixel;
4835 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4837 depth = span->depthbase;
4838 depthslope = span->depthslope;
4839 pixelmask = span->pixelmask;
4840 startx = span->startx;
// depthpixel is positioned at (span->x, span->y), so x below is relative to span->x
4842 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d starts at depthbase + depthslope*startx and advances by depthslope per column
4843 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans: for each span runs the depth test, then the shader
// mode's Span callback (only when colour buffer 0 is bound and the colour mask is non-zero),
// then the depth write. Resets thread->numspans to 0 afterwards.
// thread: the worker whose spans[]/triangles[] queues are processed.
4849 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4852 DPSOFTRAST_State_Triangle *triangle;
4853 DPSOFTRAST_State_Span *span;
4854 for (i = 0; i < thread->numspans; i++)
4856 span = &thread->spans[i];
// each span refers to its triangle by index into thread->triangles
4857 triangle = &thread->triangles[span->triangle];
4858 DPSOFTRAST_Draw_DepthTest(thread, span);
// an empty range here means the depth test left nothing to draw for this span
4859 if (span->startx >= span->endx)
4861 // run pixel shader if appropriate
4862 // do this before running depthmask code, to allow the pixelshader
4863 // to clear pixelmask values for alpha testing
4864 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4865 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4866 DPSOFTRAST_Draw_DepthWrite(thread, span);
4868 thread->numspans = 0;
// Declares the Draw command through DEFCOMMAND (22 is the macro's first argument —
// presumably the opcode number, confirm against the DEFCOMMAND definition).
// starty/endy: row range of the draw, set by DPSOFTRAST_DrawTriangles.
// refcount: set to dpsoftrast.numthreads at submit time and decremented with
//           ATOMIC_DECREMENT by each DPSOFTRAST_Interpret_Draw call.
// arrays: vertex data block; element3i/element3s: copies of the index lists.
4871 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Executes one Draw command on behalf of one worker thread.
// thread:  the worker; supplies scissor, viewport, cull/clip state, its two row bands
//          (miny1..maxy1 and miny2..maxy2) and the triangle/span queues that are filled here.
// command: the draw to interpret; its refcount is decremented once by this call, and when the
//          count reaches zero command->arrays is released with MM_FREE if the command is no
//          larger than the bare aligned header (the case where the data was allocated separately).
// Per triangle the visible steps are: fetch the three indices, clip against the near plane,
// compute and clamp the 16-bit screen bounding box, set up the w / clip-plane / attribute
// gradients and per-texture-unit mip level, then walk the rows and queue spans, flushing
// through DPSOFTRAST_Draw_ProcessSpans whenever a queue fills.
4873 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4876 int cullface = thread->cullface;
4877 int minx, maxx, miny, maxy;
4878 int miny1, maxy1, miny2, maxy2;
4879 __m128i fbmin, fbmax;
4880 __m128 viewportcenter, viewportscale;
4881 int firstvertex = command->firstvertex;
4882 int numvertices = command->numvertices;
4883 int numtriangles = command->numtriangles;
4884 const int *element3i = command->element3i;
4885 const unsigned short *element3s = command->element3s;
4886 int clipped = command->clipped;
4893 int starty, endy, bandy;
4897 float clip0origin, clip0slope;
4899 __m128 triangleedge1, triangleedge2, trianglenormal;
4902 DPSOFTRAST_State_Triangle *triangle;
4903 DPSOFTRAST_Texture *texture;
4904 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp both of this thread's row bands to the vertical extent of the scissor rectangle
4905 miny = thread->fb_scissor[1];
4906 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4907 miny1 = bound(miny, thread->miny1, maxy);
4908 maxy1 = bound(miny, thread->maxy1, maxy);
4909 miny2 = bound(miny, thread->miny2, maxy);
4910 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's rows overlap neither band: nothing to rasterize here, just drop our reference
4911 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4913 if (!ATOMIC_DECREMENT(command->refcount))
4915 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4916 MM_FREE(command->arrays);
4920 minx = thread->fb_scissor[0];
4921 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// fbmin/fbmax hold (x, y) pairs repeated across the eight 16-bit lanes; fbmax is made
// inclusive by subtracting 1. They span from the top of band 1 to the bottom of band 2.
4922 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4923 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4924 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4925 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4926 screen[3] = _mm_setzero_ps();
4927 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4928 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen-space coordinates (4 floats per vertex); the
// vertex arrays follow, each numvertices*4 floats long
4930 const float *screencoord4f = command->arrays;
4931 const float *arrays = screencoord4f + numvertices*4;
4933 // generate the 3 edges of this triangle
4934 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices are rebased by firstvertex; 16-bit and 32-bit index lists are both supported
4937 e[0] = element3s[i*3+0] - firstvertex;
4938 e[1] = element3s[i*3+1] - firstvertex;
4939 e[2] = element3s[i*3+2] - firstvertex;
4943 e[0] = element3i[i*3+0] - firstvertex;
4944 e[1] = element3i[i*3+1] - firstvertex;
4945 e[2] = element3i[i*3+2] - firstvertex;
4954 #define SKIPBACKFACE \
4955 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4956 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4957 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4958 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4959 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4963 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4967 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4972 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4973 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4975 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4976 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4978 #define CLIPPEDVERTEXCOPY(k,p1) \
4979 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4981 #define GENATTRIBCOPY(attrib, p1) \
4982 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4983 #define GENATTRIBLERP(attrib, p1, p2) \
4985 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4986 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4988 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4992 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4993 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4995 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4996 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4997 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4998 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5004 // calculate distance from nearplane
5005 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5006 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5007 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5008 if (clipdist[0] >= 0.0f)
5010 if (clipdist[1] >= 0.0f)
5012 if (clipdist[2] >= 0.0f)
5015 // triangle is entirely in front of nearplane
5016 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5023 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5031 if (clipdist[2] >= 0.0f)
5033 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5040 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5047 else if (clipdist[1] >= 0.0f)
5049 if (clipdist[2] >= 0.0f)
5051 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5058 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5064 else if (clipdist[2] >= 0.0f)
5066 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5071 else continue; // triangle is entirely behind nearplane
5074 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points into 16-bit lanes; when there are
// only three points, point 2 is duplicated into the fourth slot
5075 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5076 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5077 screenmin = _mm_min_epi16(screeni, screenir),
5078 screenmax = _mm_max_epi16(screeni, screenir);
5079 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5080 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the scissor/band rectangle
5081 screenmin = _mm_max_epi16(screenmin, fbmin);
5082 screenmax = _mm_min_epi16(screenmax, fbmax);
5083 // skip offscreen triangles
5084 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// lane 1 of the clamped box is the y bound; endy is exclusive
5086 starty = _mm_extract_epi16(screenmin, 1);
5087 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows that lie entirely between the end of band 1 and the start of band 2 belong to other threads
5088 if (starty >= maxy1 && endy <= miny2)
5090 screeny = _mm_srai_epi32(screeni, 16);
5093 triangle = &thread->triangles[thread->numtriangles];
5095 // calculate attribute plans for triangle data...
5096 // okay, this triangle is going to produce spans, we'd better project
5097 // the interpolants now (this is what gives perspective texturing),
5098 // this consists of simply multiplying all arrays by the W coord
5099 // (which is basically 1/Z), which will be undone per-pixel
5100 // (multiplying by Z again) to get the perspective-correct array
5103 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5104 __m128 mipedgescale, mipdensity;
// screen-space edge vectors divided by the scalar held in lane 0 of trianglenormal;
// the four results are then broadcast into separate registers
5105 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5106 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5107 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5108 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5109 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each screen vertex (lane 3), broadcast
5110 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5111 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5112 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5113 attribedge1 = _mm_sub_ss(w0, w1);
5114 attribedge2 = _mm_sub_ss(w2, w1);
5115 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5116 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5117 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5118 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5119 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w = { d/dx, d/dy, value at screen origin } of w, anchored at vertex 1
5120 _mm_store_ss(&triangle->w[0], attribxslope);
5121 _mm_store_ss(&triangle->w[1], attribyslope);
5122 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when any of its first three coefficients is non-zero
5127 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5129 float cliporigin, clipxslope, clipyslope;
// same gradient setup as for w above, applied to lane 2 (z) of the screen vertices
5130 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5131 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5133 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5134 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5135 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5136 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5137 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5138 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5141 clip0origin = -cliporigin/clipxslope;
5142 clip0slope = -clipyslope/clipxslope;
5143 clip0dir = clipxslope > 0 ? 1 : -1;
5145 else if(clipyslope > 0)
5147 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5148 clip0slope = dpsoftrast.fb_width;
5151 else if(clipyslope < 0)
5153 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5154 clip0slope = -dpsoftrast.fb_width;
5157 else if(clip0origin < 0) continue;
5160 mipedgescale = _mm_setzero_ps();
// set up gradients for every vertex array the current shader mode lists
5161 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5163 __m128 attrib0, attrib1, attrib2;
5164 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5165 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5167 arrays += numvertices*4;
5168 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are multiplied by the vertex w before the gradients are taken
5169 attriborigin = _mm_mul_ps(attrib1, w1);
5170 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5171 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5172 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5173 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5174 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// triangle->attribs[k] = { d/dx, d/dy, value at screen origin }
5175 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5176 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5177 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex drives mip selection: attribute delta along each of
// the two edges, scaled by the reciprocal screen-space length of that edge
5178 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5180 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5181 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5182 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5183 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// per-texture-unit mip level; units keep level 0 unless the code below assigns one
5187 memset(triangle->mip, 0, sizeof(triangle->mip));
5188 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5190 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5191 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5193 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a mip level computed
5194 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two consecutive ints read from texture->mipmap[0][2..3]
// (presumably the level-0 width and height — confirm against DPSOFTRAST_Texture)
5196 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5197 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5198 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
// take the smaller of the two per-edge squared densities
5199 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5200 // this will be multiplied in the texturing routine by the texture resolution
5201 y = _mm_cvtss_si32(mipdensity);
// 0.5*log2 of the squared density, truncated, then clamped to the last mip level
5204 y = (int)(log((float)y)*0.5f/M_LN2);
5205 if (y > texture->mipmaps - 1)
5206 y = texture->mipmaps - 1;
5207 triangle->mip[texunit] = y;
// row walk: bandy is the end of band 1 on the first pass; the loop step switches bandy to
// the end of band 2 and moves y up to at least miny2
5213 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5216 __m128 xcoords, xslope;
// ycc flags, per point, whether the current row is below that point's integer y;
// yccmask turns that into a 16-bit pattern (4 bits per point) used to pick the two active edges
5217 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5218 int yccmask = _mm_movemask_epi8(ycc);
5219 int edge0p, edge0n, edge1p, edge1n;
5228 case 0xFFFF: /*0000*/ y = endy; continue;
5229 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5230 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5231 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5232 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5233 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5234 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5235 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5236 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5237 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5238 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5239 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5240 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5241 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5242 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5243 case 0x0000: /*1111*/ y++; continue;
5251 case 0xFFFF: /*000*/ y = endy; continue;
5252 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5253 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5254 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5255 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5256 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5257 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5258 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be walked with the same edge pair, limited to the current band
5261 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5262 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5263 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5264 nexty = _mm_extract_epi16(ycc, 0);
5265 if (nexty >= bandy) nexty = bandy-1;
// xslope = dx/dy of both edges; xcoords = their x at row y, plus 0.5 for rounding
5266 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5267 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5268 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5269 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5270 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// make sure edge 0 is the left one: swap the two halves if its x is greater
5271 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5273 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5274 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5276 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5277 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5279 int startx, endx, offset;
5280 startx = _mm_cvtss_si32(xcoords);
5281 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row to the scissor's horizontal extent
5282 if (startx < minx) startx = minx;
5283 if (endx > maxx) endx = maxx;
5284 if (startx >= endx) continue;
5292 if(endx <= clip0) continue;
5293 startx = (int)clip0;
5296 else if (endx > clip0)
5298 if(startx >= clip0) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5303 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5305 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5306 span->triangle = thread->numtriangles;
5310 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5311 if (span->startx >= span->endx)
// fixed-point depth for the span: w evaluated at (span->x, span->y), scaled by
// DPSOFTRAST_DEPTHSCALE, minus the polygon offset term
5313 wslope = triangle->w[0];
5314 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5315 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5316 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: process everything queued so far
5317 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5318 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process queued spans before the triangle slots are reused
5323 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5325 DPSOFTRAST_Draw_ProcessSpans(thread);
5326 thread->numtriangles = 0;
// done with the command's vertex data: drop our reference and free it if we were the last user
5330 if (!ATOMIC_DECREMENT(command->refcount))
5332 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5333 MM_FREE(command->arrays);
// flush whatever is still queued
5336 if (thread->numspans > 0 || thread->numtriangles > 0)
5338 DPSOFTRAST_Draw_ProcessSpans(thread);
5339 thread->numtriangles = 0;
// Allocates a Draw command together with the storage for its vertex and index data, and
// points dpsoftrast.screencoord4f / dpsoftrast.post_array4f[] into that storage.
// firstvertex, numvertices, numtriangles: copied into the command (and the first two into dpsoftrast).
// element3i / element3s: source index lists; numtriangles*3 entries are copied into the command's data block.
// Data layout, in order: screen coordinates, post-transform position array, one array per
// entry of the current shader mode's arrays list, then the indices. Every vertex array is
// numvertices * float[4].
// If header + data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data block is allocated
// separately with MM_CALLOC and the command itself is only header-sized; the free sites in
// DPSOFTRAST_DrawTriangles and DPSOFTRAST_Interpret_Draw test for this case by comparing
// command->commandsize with the aligned header size.
// NOTE(review): the MM_CALLOC result is used without a NULL check in the statements shown.
5344 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5348 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two fixed arrays: screen coordinates and post-transform positions
5349 int datasize = 2*numvertices*sizeof(float[4]);
5350 DPSOFTRAST_Command_Draw *command;
5351 unsigned char *data;
// plus one array per vertex array listed for the current shader mode
5352 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5354 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5355 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5357 datasize += numvertices*sizeof(float[4]);
// plus the index list, in whichever width is used
5360 datasize += numtriangles*sizeof(unsigned short[3]);
5362 datasize += numtriangles*sizeof(int[3]);
5363 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5364 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
// too large to store inline: header-only command, data in a separate zeroed allocation
5366 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5367 data = (unsigned char *)MM_CALLOC(datasize, 1);
// small enough: data is stored inline, directly after the aligned header
5371 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5372 data = (unsigned char *)command + commandsize;
5374 command->firstvertex = firstvertex;
5375 command->numvertices = numvertices;
5376 command->numtriangles = numtriangles;
5377 command->arrays = (float *)data;
// clear every output array pointer, then assign the ones this draw uses
5378 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5379 dpsoftrast.firstvertex = firstvertex;
5380 dpsoftrast.numvertices = numvertices;
5381 dpsoftrast.screencoord4f = (float *)data;
5382 data += numvertices*sizeof(float[4]);
5383 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5384 data += numvertices*sizeof(float[4]);
5385 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5387 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5388 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5390 dpsoftrast.post_array4f[j] = (float *)data;
5391 data += numvertices*sizeof(float[4]);
// both index pointers start out NULL; one of them is pointed at the copied index data below
5393 command->element3i = NULL;
5394 command->element3s = NULL;
5397 command->element3s = (unsigned short *)data;
5398 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5402 command->element3i = (int *)data;
5403 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point that submits a triangle list for drawing.
// firstvertex, numvertices: vertex range of the draw.
// numtriangles: number of triangles; element3i / element3s are the index lists
//               (forwarded to DPSOFTRAST_Draw_AllocateDrawCommand).
// Allocates the Draw command, runs the current shader mode's vertex stage, records the
// clamped row range, and then either wakes the worker threads concerned or flushes directly.
5408 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5410 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5411 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range (dpsoftrast.drawstarty/drawendy) to the framebuffer height
5412 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5413 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated data (if any) and take the command back
5414 if (command->starty >= command->endy)
5416 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5417 MM_FREE(command->arrays);
5418 DPSOFTRAST_UndoCommand(command->commandsize);
5421 command->clipped = dpsoftrast.drawclipped;
// one reference per worker thread; each DPSOFTRAST_Interpret_Draw call drops one
5422 command->refcount = dpsoftrast.numthreads;
5424 if (dpsoftrast.usethreads)
5427 DPSOFTRAST_Draw_SyncCommands();
5428 for (i = 0; i < dpsoftrast.numthreads; i++)
5430 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// signal only threads flagged as starving whose row bands overlap this draw
5431 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5432 Thread_CondSignal(thread->drawcond);
5437 DPSOFTRAST_Draw_FlushThreads();
5441 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5442 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5444 thread->validate |= DPSOFTRAST_VALIDATE_FB;
5446 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5448 DPSOFTRAST_Command_SetRenderTargets *command;
5449 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5450 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5451 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5453 dpsoftrast.fb_width = width;
5454 dpsoftrast.fb_height = height;
5455 dpsoftrast.fb_depthpixels = depthpixels;
5456 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5457 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5458 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5459 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5460 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5461 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5462 command->width = width;
5463 command->height = height;
5466 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5468 int commandoffset = thread->commandoffset;
5469 while (commandoffset != endoffset)
5471 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5472 switch (command->opcode)
5474 #define INTERPCOMMAND(name) \
5475 case DPSOFTRAST_OPCODE_##name : \
5476 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5477 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5478 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5479 commandoffset = 0; \
5481 INTERPCOMMAND(Viewport)
5482 INTERPCOMMAND(ClearColor)
5483 INTERPCOMMAND(ClearDepth)
5484 INTERPCOMMAND(ColorMask)
5485 INTERPCOMMAND(DepthTest)
5486 INTERPCOMMAND(ScissorTest)
5487 INTERPCOMMAND(Scissor)
5488 INTERPCOMMAND(BlendFunc)
5489 INTERPCOMMAND(BlendSubtract)
5490 INTERPCOMMAND(DepthMask)
5491 INTERPCOMMAND(DepthFunc)
5492 INTERPCOMMAND(DepthRange)
5493 INTERPCOMMAND(PolygonOffset)
5494 INTERPCOMMAND(CullFace)
5495 INTERPCOMMAND(SetTexture)
5496 INTERPCOMMAND(SetShader)
5497 INTERPCOMMAND(Uniform4f)
5498 INTERPCOMMAND(UniformMatrix4f)
5499 INTERPCOMMAND(Uniform1i)
5500 INTERPCOMMAND(SetRenderTargets)
5501 INTERPCOMMAND(ClipPlane)
5503 case DPSOFTRAST_OPCODE_Draw:
5504 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5505 commandoffset += command->commandsize;
5506 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5508 thread->commandoffset = commandoffset;
5511 case DPSOFTRAST_OPCODE_Reset:
5516 thread->commandoffset = commandoffset;
5519 static int DPSOFTRAST_Draw_Thread(void *data)
5521 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5522 while(thread->index >= 0)
5524 if (thread->commandoffset != dpsoftrast.drawcommand)
5526 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5530 Thread_LockMutex(thread->drawmutex);
5531 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5533 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5534 thread->starving = true;
5535 Thread_CondWait(thread->drawcond, thread->drawmutex);
5536 thread->starving = false;
5538 Thread_UnlockMutex(thread->drawmutex);
5544 static void DPSOFTRAST_Draw_FlushThreads(void)
5546 DPSOFTRAST_State_Thread *thread;
5548 DPSOFTRAST_Draw_SyncCommands();
5549 if (dpsoftrast.usethreads)
5551 for (i = 0; i < dpsoftrast.numthreads; i++)
5553 thread = &dpsoftrast.threads[i];
5554 if (thread->commandoffset != dpsoftrast.drawcommand)
5556 Thread_LockMutex(thread->drawmutex);
5557 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5558 Thread_CondSignal(thread->drawcond);
5559 Thread_UnlockMutex(thread->drawmutex);
5562 for (i = 0; i < dpsoftrast.numthreads; i++)
5564 thread = &dpsoftrast.threads[i];
5565 if (thread->commandoffset != dpsoftrast.drawcommand)
5567 Thread_LockMutex(thread->drawmutex);
5568 if (thread->commandoffset != dpsoftrast.drawcommand)
5570 thread->waiting = true;
5571 Thread_CondWait(thread->waitcond, thread->drawmutex);
5572 thread->waiting = false;
5574 Thread_UnlockMutex(thread->drawmutex);
5580 for (i = 0; i < dpsoftrast.numthreads; i++)
5582 thread = &dpsoftrast.threads[i];
5583 if (thread->commandoffset != dpsoftrast.drawcommand)
5584 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5587 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: hands off to DPSOFTRAST_Draw_FlushThreads(),
// which brings every thread up to date with the queued commands.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Public finish entry point; performs the same work as DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5600 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5610 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5611 dpsoftrast.bigendian = u.b[3];
5612 dpsoftrast.fb_width = width;
5613 dpsoftrast.fb_height = height;
5614 dpsoftrast.fb_depthpixels = depthpixels;
5615 dpsoftrast.fb_colorpixels[0] = colorpixels;
5616 dpsoftrast.fb_colorpixels[1] = NULL;
5617 dpsoftrast.fb_colorpixels[1] = NULL;
5618 dpsoftrast.fb_colorpixels[1] = NULL;
5619 dpsoftrast.viewport[0] = 0;
5620 dpsoftrast.viewport[1] = 0;
5621 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5622 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5623 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5624 dpsoftrast.texture_firstfree = 1;
5625 dpsoftrast.texture_end = 1;
5626 dpsoftrast.texture_max = 0;
5627 dpsoftrast.color[0] = 1;
5628 dpsoftrast.color[1] = 1;
5629 dpsoftrast.color[2] = 1;
5630 dpsoftrast.color[3] = 1;
5631 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5632 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5633 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5634 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5635 for (i = 0; i < dpsoftrast.numthreads; i++)
5637 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5639 thread->cullface = GL_BACK;
5640 thread->colormask[0] = 1;
5641 thread->colormask[1] = 1;
5642 thread->colormask[2] = 1;
5643 thread->colormask[3] = 1;
5644 thread->blendfunc[0] = GL_ONE;
5645 thread->blendfunc[1] = GL_ZERO;
5646 thread->depthmask = true;
5647 thread->depthtest = true;
5648 thread->depthfunc = GL_LEQUAL;
5649 thread->scissortest = false;
5650 thread->viewport[0] = 0;
5651 thread->viewport[1] = 0;
5652 thread->viewport[2] = dpsoftrast.fb_width;
5653 thread->viewport[3] = dpsoftrast.fb_height;
5654 thread->scissor[0] = 0;
5655 thread->scissor[1] = 0;
5656 thread->scissor[2] = dpsoftrast.fb_width;
5657 thread->scissor[3] = dpsoftrast.fb_height;
5658 thread->depthrange[0] = 0;
5659 thread->depthrange[1] = 1;
5660 thread->polygonoffset[0] = 0;
5661 thread->polygonoffset[1] = 0;
5662 thread->clipplane[0] = 0;
5663 thread->clipplane[1] = 0;
5664 thread->clipplane[2] = 0;
5665 thread->clipplane[3] = 1;
5667 thread->numspans = 0;
5668 thread->numtriangles = 0;
5669 thread->commandoffset = 0;
5670 thread->waiting = false;
5671 thread->starving = false;
5673 thread->validate = -1;
5674 DPSOFTRAST_Validate(thread, -1);
5676 if (dpsoftrast.usethreads)
5678 thread->waitcond = Thread_CreateCond();
5679 thread->drawcond = Thread_CreateCond();
5680 thread->drawmutex = Thread_CreateMutex();
5681 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5687 void DPSOFTRAST_Shutdown(void)
5690 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5692 DPSOFTRAST_State_Thread *thread;
5693 for (i = 0; i < dpsoftrast.numthreads; i++)
5695 thread = &dpsoftrast.threads[i];
5696 Thread_LockMutex(thread->drawmutex);
5698 Thread_CondSignal(thread->drawcond);
5699 Thread_UnlockMutex(thread->drawmutex);
5700 Thread_WaitThread(thread->thread, 0);
5701 Thread_DestroyCond(thread->waitcond);
5702 Thread_DestroyCond(thread->drawcond);
5703 Thread_DestroyMutex(thread->drawmutex);
5706 for (i = 0;i < dpsoftrast.texture_end;i++)
5707 if (dpsoftrast.texture[i].bytes)
5708 MM_FREE(dpsoftrast.texture[i].bytes);
5709 if (dpsoftrast.texture)
5710 free(dpsoftrast.texture);
5711 if (dpsoftrast.threads)
5712 MM_FREE(dpsoftrast.threads);
5713 memset(&dpsoftrast, 0, sizeof(dpsoftrast));