3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform selection for the alignment, memory-barrier and atomic-counter
// macros used by the threaded renderer. Every visible branch defines ALIGN,
// ATOMIC, MEMORY_BARRIER, ATOMIC_COUNTER, ATOMIC_INCREMENT, ATOMIC_DECREMENT
// and ATOMIC_ADD.
// NOTE(review): MEMORY_BARRIER expands to _mm_sfence() in every branch, while
// the commented-out alternative is __sync_synchronize() - confirm that a
// store-only fence is what the users of MEMORY_BARRIER need.
// Apple: counters are int32_t driven by the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows (mingw): GCC attribute syntax with Win32 Interlocked* calls.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Any other GCC target: counters are plain int driven by the __sync_* builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) goes in front of the declaration; Interlocked* take
// the counter address without a cast.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment attribute, a no-op barrier and ordinary
// (non-atomic) integer arithmetic on a plain int counter.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// GCC releases older than 4.6 do not provide _mm_cvtss_f32(), so supply it
// through the vector-extract builtin there. The version test must compare
// major and minor together: the previous form tested the misspelled __GNUC
// (an undefined identifier, i.e. 0, so the test was always true) and also
// matched any release whose minor number is below 6, such as GCC 5.1.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
// Allocation helpers. The first group is built on _mm_malloc/_mm_free and
// requests ALIGN_SIZE alignment for every allocation.
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
// Zero-filled counterpart of MM_MALLOC: allocates nmemb*size bytes and clears
// them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow here, whereas calloc()
// performs that check - confirm callers never pass large counts.
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
// Second group: the same names mapped straight onto the C library allocator.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture-coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// is used to size the attribute tables declared later in this file.
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// One texture object stored in the dpsoftrast.texture table.
126 typedef struct DPSOFTRAST_Texture_s
// sampling filter, set by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; presumably the number of outstanding binds - verify against the bind/draw code
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL value marks the slot as free
137 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size of the level, [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records live in a byte pool; each record is aligned with ALIGN()
// and its size is rounded to a multiple of COMMAND_SIZE (== ALIGN_SIZE).
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command record: the opcode that selects the
// interpreter routine and the record size in bytes.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is reserved; DPSOFTRAST_AllocateCommand writes it when the pool wraps.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the record type DPSOFTRAST_Command_<name>,
// which starts with the same opcode/commandsize header as DPSOFTRAST_Command.
// (Presumably `fields` is spliced in after the header - the line doing so is
// not visible here.)
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// Size in bytes of the shared command ring buffer (2 MiB).
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// Upper bound for a single command record, in bytes.
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands. Other code in this file also reads
// commandpool.freecommand and commandpool.usedcommands; only the byte buffer
// member is visible in this declaration.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation data consumed by the span shaders.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][k] is a 4-float vector; as used by DPSOFTRAST_CALCATTRIB,
// k=0 is the per-x slope, k=1 the per-y slope and k=2 the base value.
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE evaluation of attribute `arrayindex` at the start of a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// `slope` and `data` must be __m128 lvalues; the attribute rows are read with
// aligned loads (_mm_load_ps).
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
/*
 * Scalar (float[4]) counterpart of DPSOFTRAST_CALCATTRIB.
 *   slope[k] = attribs[arrayindex][0][k]
 *   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k]
 *                                        + span->y * attribs[arrayindex][1][k]
 * triangle and span are pointers, data and slope are float[4] lvalues.
 * Every macro argument is parenthesized before ->/[] is applied, so callers
 * may pass expressions such as &span or base + i (the previous form expanded
 * span->x inside the parentheses and only worked for plain identifiers).
 */
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced by rasterizing a triangle; spans are
// queued per thread in DPSOFTRAST_State_Thread.spans.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread span queue, triangle queue and pixel mask
// (see the spans[], triangles[] and pixelmaskarray[] members of
// DPSOFTRAST_State_Thread).
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate; each bit selects one group of derived
// values recomputed by DPSOFTRAST_Validate.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three groups together.
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes implemented by the rasterizer. DPSOFTRAST_RecalcBlendFunc maps
// a (source factor, destination factor, subtract flag) combination onto one
// of these; combinations it does not list fall back to OPAQUE.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread: its copy of the render state (updated
// by the DPSOFTRAST_Interpret_* routines), values derived from that state,
// its read position in the shared command pool and its span/triangle queues.
// Only part of the member list is visible here.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane converted by DPSOFTRAST_RecalcClipPlane
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
// pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against dpsoftrast.drawcommand and
// commandpool.freecommand by DPSOFTRAST_Draw_FreeCommandPool
289 ATOMIC(volatile int commandoffset);
// handshake flags used with the thread's mutex and condition variables in
// DPSOFTRAST_Draw_FreeCommandPool
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state as seen by the API entry points: framebuffer
// pointers, the caller-side copy of the render state, the vertex array
// pointers, the texture table, the worker threads and the command pool.
// Only part of the member list is visible here.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers (NULL entries are skipped by ClearColor)
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
// kept up to date by DPSOFTRAST_Viewport
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// source vertex arrays
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture table; rebased by DPSOFTRAST_Texture_Grow
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// texture table: slot search starts at texture_firstfree (see DPSOFTRAST_Texture_New)
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
// static message describing the most recent error
350 const char *errorstring;
// array of dpsoftrast.numthreads worker states
355 DPSOFTRAST_State_Thread *threads;
// pool offset published to the workers by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
363 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting a float depth to the integer
// depth buffer format (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into a 32-bit BGRA8 pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7. Each channel
// is scaled by 255 and rounded by adding 0.5 before truncation.
// Arguments are parenthesized so expressions such as (x + y) scale correctly,
// and the shifts are carried out on unsigned values because shifting 255 left
// by 24 in a signed int overflows (undefined behaviour). The result is still
// an int with the same bit pattern as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d); the argument is parenthesized so that
// expression arguments are subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from viewport = {x, y, width, height}.
// Outputs are 4-float vectors whose element 0 is the constant term
// (center 0, scale 1), element 1 the horizontal term, element 2 the vertical
// term and element 3 a fixed 0.5/0.5 pair (presumably the depth term -
// confirm against the vertex transform).
// The vertical term is measured from the top of the framebuffer
// (dpsoftrast.fb_height - y - height/2) and uses a negative scale, i.e. the
// image is flipped relative to the viewport's y origin.
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
// Assigns the framebuffer rows [miny1,maxy1) and [miny2,maxy2) that this
// thread is responsible for, based on thread->index and dpsoftrast.numthreads.
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads bands and the thread
// gets band `index` plus band `numthreads + index`
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// single band: both ranges are set to the same rows
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Converts thread->clipplane into thread->fb_clipplane using the thread's
// viewport transform: each coefficient is divided by the matching viewport
// scale, then the plane constant is reduced by the dot product of the
// viewport center with the converted coefficients.
// Index pairing follows the viewport vectors: plane [0..2] go with viewport
// elements [1..3], plane [3] with viewport element [0].
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything that depends on the framebuffer-related state of a
// thread: the scissor rectangle in framebuffer rows (fb_scissor = x, y,
// width, height), the viewport transform, the clip plane and the thread's
// row ranges.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// scissor rectangle with the y axis flipped against fb_height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissoring disabled the rectangle is the whole framebuffer
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the right and bottom edges to the framebuffer
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport vectors, so it is refreshed after them
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
// Effective depth comparison: the configured depthfunc while depth testing
// is enabled, GL_ALWAYS otherwise.
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Translates the GL-style blend state of a thread into thread->fb_blendmode.
// The two factors are combined into one switch key, (sfactor << 16) | dfactor;
// any combination without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognised
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC(s, d, mode) expands to one `case` of the switch
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular (additive equation) blending
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to MUL
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one flag of f is
// still pending in thread->validate; the expression always evaluates to 0.
// `thread` is parenthesized before -> is applied so pointer expressions are
// accepted; note that it is evaluated twice when validation runs.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived values selected by `mask` that are currently marked
// dirty in thread->validate, clearing each dirty bit before its Recalc
// routine runs.
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only consider groups that are both requested and dirty
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle. A handle is valid when it lies in
// [1, dpsoftrast.texture_end) and its slot has pixel storage (bytes != NULL);
// in that case the address of the slot is returned. Callers in this file
// treat a false (null) result as "no such texture".
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture table and rebases every texbound[] pointer
// (global and per thread) from the old table address to the new one.
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture
// and never checked; on failure the old table pointer would be lost and the
// rebasing below would work from NULL - confirm the intended out-of-memory
// policy.
// NOTE(review): the rebasing subtracts `oldtexture` after realloc may already
// have released that block; converting the bound pointers to indices before
// the realloc would avoid relying on a stale pointer value.
502 static void DPSOFTRAST_Texture_Grow(void)
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the caller-side bindings
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the bindings held by every worker thread
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions and flags and allocates
// zero-filled storage for all of its mip levels.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a
//            DPSOFTRAST_TEXTURE_FORMAT_* value (extracted with
//            DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK)
//   width, height, depth - size of mip level 0; each must be a power of two
//            and at most DPSOFTRAST_TEXTURE_MAXSIZE
// On a rejected request dpsoftrast.errorstring is set to a message naming the
// violated rule. (The return statements are not visible here; presumably the
// slot number is returned on success - confirm.)
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int before the size limits below
// are applied, so very large or mixed-sign arguments are not reliably caught
// by this test alone.
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second assignment is not visible
// here; it repeats the DEPTH/2D message - confirm the text matches the case
// it reports.
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this test repeats the previous one verbatim
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is zero only for powers of two (and zero)
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot taken now; the table is grown and the
// used range extended when the slot lies beyond them
602 dpsoftrast.texture_firstfree = texnum + 1;
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per-level layout: 4 bytes per texel; `size` is the running byte offset of
// the level and `s` its byte size
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after level 0 when mipmaps are not requested
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible here
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its slot and updates
// the free/used bookkeeping of the texture table. Unknown handles are ignored.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
// the memset below clears bytes as well, so the slot reads as free afterwards
655 texture->bytes = NULL;
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index` by averaging texels
// of the next larger level. Texels are 4 bytes; each output channel is the
// rounded mean of the contributing input channels.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output row; i0..i3 = the parent rows feeding it
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to build when only level 0 exists
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent depth
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent height
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// row addresses: level offset + 4 * ((height * layer + row) * width)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight parent texels per output texel
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read but not advanced by this loop; that only
// matters if w can exceed 1 here - confirm.
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// four parent texels per output texel
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels into mip
// level 0 of texture `index` at (blockx, blocky), then rebuilds the mipmaps.
// The destination pointer starts from the end of level 0 and moves up one
// row per source row, so the image is stored vertically flipped relative to
// `pixels`.
// NOTE(review): no use of `mip` and no bounds check of the block against the
// texture size is visible here - confirm.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// mipmap[0][1] is the byte size of level 0, mipmap[0][2] its width
751 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
754 dst -= texture->mipmap[0][2] * 4;
755 memcpy(dst, pixels, blockwidth * 4);
756 pixels += blockwidth * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces mip level 0 of texture `index` with `pixels` (4 bytes per pixel,
// one memcpy per row) and rebuilds the mipmaps. The destination pointer
// starts at the end of level 0, which suggests rows are written bottom-up as
// in DPSOFTRAST_Texture_UpdatePartial (the pointer updates are not visible
// here).
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// stride = bytes per row of level 0
770 int i, stride = texture->mipmap[0][2]*4;
771 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
772 for (i = texture->mipmap[0][3];i > 0;i--)
775 memcpy(dst, pixels, stride);
779 DPSOFTRAST_Texture_CalculateMipmaps(index);
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
783 DPSOFTRAST_Texture *texture;
784 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->mipmap[mip][2];
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791 return texture->mipmap[mip][3];
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
795 DPSOFTRAST_Texture *texture;
796 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the pixel
// storage of texture `index`, or a null pointer for an unknown handle.
// NOTE(review): `mip` is used to index mipmap[] without a range check in the
// visible lines - confirm callers only pass valid levels.
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
801 DPSOFTRAST_Texture *texture;
802 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
805 return texture->bytes + texture->mipmap[mip][0];
// Sets the sampling filter of texture `index`. Filters ordered above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are treated as needing mipmaps: requesting
// one for a texture created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP records an
// error message in dpsoftrast.errorstring. Unknown handles are ignored.
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
809 DPSOFTRAST_Texture *texture;
810 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
813 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
818 texture->filter = filter;
821 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes all commands written so far by copying the pool's write offset
// (commandpool.freecommand) into dpsoftrast.drawcommand. When threads are in
// use a MEMORY_BARRIER is issued first so the command bytes are ordered
// before the new offset.
823 static void DPSOFTRAST_Draw_SyncCommands(void)
825 if(dpsoftrast.usethreads) MEMORY_BARRIER;
826 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for `space` bytes in the command ring buffer by recomputing how
// much of it is still in use by the worker threads and, when necessary,
// blocking on the thread that lags furthest behind.
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
831 DPSOFTRAST_State_Thread *thread;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already according to the cached count
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything written so far visible to the workers before waiting on them
837 DPSOFTRAST_Draw_SyncCommands();
// usedcommands becomes the largest distance (modulo the pool size) between
// the write offset and any thread's read offset
843 for (i = 0; i < dpsoftrast.numthreads; i++)
845 thread = &dpsoftrast.threads[i];
846 commandoffset = freecommand - thread->commandoffset;
847 if (commandoffset < 0)
848 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849 if (commandoffset > usedcommands)
852 usedcommands = commandoffset;
// done when the space is available or no thread was selected to wait on
855 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// otherwise sleep on the selected thread's wait condition until it signals;
// a starving thread is woken first so it can make progress
857 thread = &dpsoftrast.threads[waitindex];
858 Thread_LockMutex(thread->drawmutex);
859 if (thread->commandoffset != dpsoftrast.drawcommand)
861 thread->waiting = true;
862 if (thread->starving) Thread_CondSignal(thread->drawcond);
863 Thread_CondWait(thread->waitcond, thread->drawmutex);
864 thread->waiting = false;
866 Thread_UnlockMutex(thread->drawmutex);
// store the refreshed usage count
868 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE. The bit masking
// assumes COMMAND_SIZE is a power of two.
871 #define DPSOFTRAST_ALIGNCOMMAND(size) \
872 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Reserves a record of type DPSOFTRAST_Command_<name> in the command pool,
// tagged with DPSOFTRAST_OPCODE_<name> and sized to the aligned struct size,
// and yields a typed pointer to it.
873 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
874 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a command with the given opcode in the ring
// buffer and fills in the record header (opcode, commandsize).
// When the record does not fit before the end of the buffer, a
// DPSOFTRAST_OPCODE_Reset record is written at the current offset and the
// unused tail is counted as used.
// (The return statement is not visible here; DPSOFTRAST_ALLOCATECOMMAND casts
// the result to the command type, so presumably the record address is
// returned.)
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
878 DPSOFTRAST_Command *command;
879 int freecommand = dpsoftrast.commandpool.freecommand;
880 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room needed beyond `size`: one header for a possible Reset record, plus the
// skipped tail when the record has to wrap
881 int extra = sizeof(DPSOFTRAST_Command);
882 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for the workers (threaded) or flush
884 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
886 if (dpsoftrast.usethreads)
887 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
889 DPSOFTRAST_Draw_FlushThreads();
// both calls may change the pool offsets, so reload them
890 freecommand = dpsoftrast.commandpool.freecommand;
891 usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap: mark the tail with a Reset record
893 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
895 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896 command->opcode = DPSOFTRAST_OPCODE_Reset;
897 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new record
900 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901 command->opcode = opcode;
902 command->commandsize = size;
904 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
906 dpsoftrast.commandpool.freecommand = freecommand;
907 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently reserved `size` bytes of the command pool:
// the used count is reduced by `size` and the write offset is stored back
// after adjustment (the adjustment is only partly visible here; the offset is
// wrapped by adding the pool size).
911 static void DPSOFTRAST_UndoCommand(int size)
913 int freecommand = dpsoftrast.commandpool.freecommand;
914 int usedcommands = dpsoftrast.commandpool.usedcommands;
917 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918 usedcommands -= size;
919 dpsoftrast.commandpool.freecommand = freecommand;
920 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): carries the viewport rectangle.
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker side: stores the rectangle in the thread state and marks the
// framebuffer-derived values dirty.
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
926 thread->viewport[0] = command->x;
927 thread->viewport[1] = command->y;
928 thread->viewport[2] = command->width;
929 thread->viewport[3] = command->height;
930 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command and also updates the caller-side copy of the
// viewport and its derived center/scale vectors.
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
934 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
937 command->width = width;
938 command->height = height;
940 dpsoftrast.viewport[0] = x;
941 dpsoftrast.viewport[1] = y;
942 dpsoftrast.viewport[2] = width;
943 dpsoftrast.viewport[3] = height;
944 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): carries the clear color as four floats.
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker side: fills this thread's rows of every attached color buffer,
// limited to the scissor rectangle, with the packed BGRA8 clear color.
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
950 int i, x1, y1, x2, y2, w, h, x, y;
951 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges must be current before they are read
955 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956 miny1 = thread->miny1;
957 maxy1 = thread->maxy1;
958 miny2 = thread->miny2;
959 maxy2 = thread->maxy2;
960 x1 = thread->fb_scissor[0];
961 y1 = thread->fb_scissor[1];
962 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the thread's first and last band
964 if (y1 < miny1) y1 = miny1;
965 if (y2 > maxy2) y2 = maxy2;
970 // FIXME: honor fb_colormask?
971 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; unattached ones are skipped
972 for (i = 0;i < 4;i++)
974 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, then y jumps to miny2 and the second
// pass covers rows up to maxy2
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command.
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
987 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): carries the clear depth as a float.
994 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker side: fills this thread's rows of the depth buffer, limited to the
// scissor rectangle, with the integer form of the clear depth.
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
997 int x1, y1, x2, y2, w, h, x, y;
998 int miny1, maxy1, miny2, maxy2;
// fb_scissor and the row ranges must be current before they are read
1002 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003 miny1 = thread->miny1;
1004 maxy1 = thread->maxy1;
1005 miny2 = thread->miny2;
1006 maxy2 = thread->maxy2;
1007 x1 = thread->fb_scissor[0];
1008 y1 = thread->fb_scissor[1];
1009 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the thread's first and last band
1011 if (y1 < miny1) y1 = miny1;
1012 if (y2 > maxy2) y2 = maxy2;
1017 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, then y jumps to miny2 and the second
// pass covers rows up to maxy2
1018 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019 for (;y < bandy;y++)
1021 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command.
1026 void DPSOFTRAST_ClearDepth(float d)
1028 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): carries one write-enable flag per channel.
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker side: normalises the flags to 0/1 and builds the pixel write mask.
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1035 thread->colormask[0] = command->r != 0;
1036 thread->colormask[1] = command->g != 0;
1037 thread->colormask[2] = command->b != 0;
1038 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects that channel's byte
// of a BGRA8 pixel: red 0x00FF0000, green 0x0000FF00, blue 0x000000FF,
// alpha 0xFF000000
1039 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command.
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1043 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enables or disables depth testing.
1050 DEFCOMMAND(5, DepthTest, int enable;)
// Worker side: stores the flag and marks the effective depth function dirty.
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1053 thread->depthtest = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues a DepthTest command.
1056 void DPSOFTRAST_DepthTest(int enable)
1058 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059 command->enable = enable;
// Command 6 (ScissorTest): carries a single enable flag.
1062 DEFCOMMAND(6, ScissorTest, int enable;)
// Applies a ScissorTest command to one thread's state: stores the flag and
// sets DPSOFTRAST_VALIDATE_FB in thread->validate.
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1065 thread->scissortest = command->enable;
1066 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ScissorTest command and records the
// requested enable flag in it.
1068 void DPSOFTRAST_ScissorTest(int enable)
1070 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071 command->enable = enable;
// Command 7 (Scissor): carries the scissor rectangle as x, y, width, height.
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Applies a Scissor command to one thread's state: copies the rectangle into
// thread->scissor[0..3] in x, y, width, height order and sets
// DPSOFTRAST_VALIDATE_FB in thread->validate.
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1077 thread->scissor[0] = command->x;
1078 thread->scissor[1] = command->y;
1079 thread->scissor[2] = command->width;
1080 thread->scissor[3] = command->height;
1081 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Scissor command and records the rectangle.
// NOTE(review): only the width/height stores are visible in this excerpt;
// presumably command->x and command->y are assigned as well - confirm.
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1085 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1088 command->width = width;
1089 command->height = height;
// Command 8 (BlendFunc): carries the source and destination blend factors.
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command to one thread's state: blendfunc[0] receives the
// source factor, blendfunc[1] the destination factor, and
// DPSOFTRAST_VALIDATE_BLENDFUNC is set in thread->validate.
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1095 thread->blendfunc[0] = command->sfactor;
1096 thread->blendfunc[1] = command->dfactor;
1097 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and records both factors.
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1101 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102 command->sfactor = sfactor;
1103 command->dfactor = dfactor;
// Command 9 (BlendSubtract): carries a single enable flag.
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to one thread's state: stores the flag and
// sets DPSOFTRAST_VALIDATE_BLENDFUNC in thread->validate.
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1109 thread->blendsubtract = command->enable;
1110 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and records the
// requested enable flag in it.
1112 void DPSOFTRAST_BlendSubtract(int enable)
1114 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115 command->enable = enable;
// Command 10 (DepthMask): carries a single enable flag.
1118 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command to one thread's state by storing the flag in
// thread->depthmask.
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1121 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and records the requested
// enable flag in it.
1123 void DPSOFTRAST_DepthMask(int enable)
1125 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126 command->enable = enable;
// Command 11 (DepthFunc): carries the depth comparison function identifier.
1129 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command to one thread's state by storing the function
// identifier in thread->depthfunc.
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1132 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records the requested
// comparison function in it.
1134 void DPSOFTRAST_DepthFunc(int func)
1136 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137 command->func = func;
// Command 12 (DepthRange): carries the near and far depth range values.
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command to one thread's state: depthrange[0] receives
// the near value and depthrange[1] the far value.
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1143 thread->depthrange[0] = command->nearval;
1144 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records both values.
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1148 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149 command->nearval = nearval;
1150 command->farval = farval;
// Command 13 (PolygonOffset): carries the two polygon offset terms.
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command to one thread's state: polygonoffset[0]
// receives alongnormal and polygonoffset[1] receives intoview.
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1156 thread->polygonoffset[0] = command->alongnormal;
1157 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both
// offset terms.
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1161 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162 command->alongnormal = alongnormal;
1163 command->intoview = intoview;
// Command 14 (CullFace): carries the face culling mode.
1166 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command to one thread's state by storing the mode in
// thread->cullface.
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1169 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the requested
// mode in it.
1171 void DPSOFTRAST_CullFace(int mode)
1173 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174 command->mode = mode;
// Sets the current constant color. The four components are written straight
// into the global dpsoftrast.color[] in r, g, b, a order; no command record
// is allocated here.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads a rectangle of pixels back from color buffer 0 into outpixels.
//   blockx, blocky            - origin of the requested rectangle
//   blockwidth, blockheight   - size of the requested rectangle
//   outpixels                 - destination, addressed with a row pitch of
//                               blockwidth * 4 bytes
// The rectangle is clamped to the framebuffer bounds before copying.
// NOTE(review): the declarations of bx1/by1/x/y/b/o and the bodies of both
// copy loops are not visible in this excerpt; bx1/by1 presumably start as
// blockx/blocky - confirm.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Both strides are in bytes (4 bytes per pixel).
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// Clamp the requested rectangle to the framebuffer.
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Big-endian hosts take a per-pixel path (presumably to reorder bytes - the
// inner loop body is not visible here).
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// Source rows are addressed as (fb_height - 1 - y), i.e. flipped vertically
// relative to the output rows.
// NOTE(review): the output offset uses (y - by1) and no column term, so a
// rectangle clamped on its left or top edge appears to land shifted in
// outpixels - confirm whether callers ever pass negative origins.
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
// Other hosts: same row addressing as above (the row copy itself is not
// visible in this excerpt).
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle from color buffer 0 into one mip level of a texture.
//   index          - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip            - destination mip level; must be in [0, texture->mipmaps)
//   tx, ty         - destination origin inside the mip level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - requested size of the rectangle
// Returns without copying when the texture lookup fails or mip is out of range.
// NOTE(review): the declarations/initial values of tx1/ty1/sx1/sy1 and the
// computation of tw/th/sw/sh are not visible in this excerpt; they are
// presumably the clamped extents of the two rectangles - confirm.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer; destination is the selected mip level, whose
// byte offset, width and height come from texture->mipmap[mip][0], [2], [3].
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// Clamp both rectangles to their respective surfaces.
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// The copy size is limited to the smaller of the two clamped rectangles.
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
// Degenerate size check (presumably followed by an early return - the
// statement it guards is not visible in this excerpt).
1279 if (tw < 1 || th < 1)
// Convert both start rows so they are measured from the opposite edge of
// their surface.
1281 sy1 = sheight - sy1 - th;
1282 ty1 = theight - ty1 - th;
// Row-by-row copy; tw*4 is the row length in bytes (pixels are unsigned int).
1283 for (y = 0;y < th;y++)
1284 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// Regenerate the mip levels when the texture has more than one.
1285 if (texture->mipmaps > 1)
1286 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): carries the texture unit number and the texture
// pointer to bind (may be NULL, see DPSOFTRAST_SetTexture).
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state. If the unit already has
// a texture bound on this thread, that texture's `binds` counter is atomically
// decremented before the new texture pointer is stored.
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1292 if (thread->texbound[command->unitnum])
1293 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294 thread->texbound[command->unitnum] = command->texture;
// Public entry point for binding a texture to a texture unit.
//   unitnum - texture unit; must be in [0, DPSOFTRAST_MAXTEXTUREUNITS)
//   index   - texture handle; 0 resolves to no texture without being an error
// On an invalid unit or an unknown non-zero handle, dpsoftrast.errorstring is
// set (presumably followed by an early return - not visible in this excerpt).
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1298 DPSOFTRAST_Command_SetTexture *command;
1299 DPSOFTRAST_Texture *texture;
1300 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1302 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1305 texture = DPSOFTRAST_Texture_GetByIndex(index);
// A failed lookup is only an error when a non-zero handle was given.
1306 if (index && !texture)
1308 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313 command->unitnum = unitnum;
1314 command->texture = texture;
// Mirror the binding in the global state as well.
1316 dpsoftrast.texbound[unitnum] = texture;
// The bind counter is raised by the number of threads; each thread's
// interpreter later decrements it once when the unit is rebound.
// NOTE(review): texture can be NULL when index is 0; a guard is presumably
// present on a line not visible in this excerpt - confirm.
1318 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in the global
// state. The pointer is stored as given; nothing is copied here.
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1323 dpsoftrast.pointer_vertex3f = vertex3f;
1324 dpsoftrast.stride_vertex = stride;
// Selects a float color array as the vertex color source: stores the pointer
// and stride and clears the byte color pointer so that only one of the two
// color sources is set.
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1328 dpsoftrast.pointer_color4f = color4f;
1329 dpsoftrast.pointer_color4ub = NULL;
1330 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array as the vertex color source: stores the
// pointer and stride and clears the float color pointer so that only one of
// the two color sources is set.
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1334 dpsoftrast.pointer_color4f = NULL;
1335 dpsoftrast.pointer_color4ub = color4ub;
1336 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: pointer, number of
// components per vertex, and stride.
// NOTE(review): unitnum is used directly as an array index with no range
// check in the visible code; callers must pass a valid unit.
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1340 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): carries the shader mode, the permutation bits and
// the exact-specular-math flag.
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command to one thread's state by copying the three
// fields into the thread.
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1348 thread->shader_mode = command->mode;
1349 thread->shader_permutation = command->permutation;
1350 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point for selecting the shader: allocates a SetShader command,
// fills in its three fields, and mirrors the same values in the global state.
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1354 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355 command->mode = mode;
1356 command->permutation = permutation;
1357 command->exactspecularmath = exactspecularmath;
// Global copy of the same settings.
1359 dpsoftrast.shader_mode = mode;
1360 dpsoftrast.shader_permutation = permutation;
1361 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f): carries a uniform index and four float values.
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command to one thread's state. uniform4f[] holds four
// floats per uniform, so the destination starts at element index*4.
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1367 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point for setting a four-component uniform from scalars:
// allocates a Uniform4f command, fills in the index and values, and mirrors
// the values in the global uniform4f[] at element index*4.
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1371 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372 command->index = index;
1373 command->val[0] = v0;
1374 command->val[1] = v1;
1375 command->val[2] = v2;
1376 command->val[3] = v3;
// Global copy of the same values.
1378 dpsoftrast.uniform4f[index*4+0] = v0;
1379 dpsoftrast.uniform4f[index*4+1] = v1;
1380 dpsoftrast.uniform4f[index*4+2] = v2;
1381 dpsoftrast.uniform4f[index*4+3] = v3;
// Public entry point for setting a four-component uniform from an array.
// v must point to at least four floats; they are copied into a new Uniform4f
// command and into the global uniform4f[] at element index*4.
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1385 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386 command->index = index;
1387 memcpy(command->val, v, sizeof(command->val));
1389 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): carries a uniform index and sixteen floats;
// the value array is declared through ALIGN().
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command to one thread's state. All sixteen floats
// are copied starting at uniform4f[index*4], so one matrix occupies four
// consecutive four-float uniform slots.
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1395 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point for setting one or more 4x4 matrix uniforms.
//   uniform    - first uniform index
//   arraysize  - number of matrices to read from v
//   transpose  - transpose request (the branch that tests it is not visible in
//                this excerpt; presumably it guards the transpose sequence
//                below - confirm)
//   v          - arraysize * 16 floats; may be unaligned
// One UniformMatrix4f command is allocated per matrix, and each matrix is also
// mirrored in the global uniform4f[].
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// Each matrix advances the uniform index by 4 slots and v by 16 floats.
1401 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1403 __m128 m0, m1, m2, m3;
1404 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405 command->index = (DPSOFTRAST_UNIFORM)index;
// Choose unaligned or aligned loads based on the address of v.
1406 if (((size_t)v)&(ALIGN_SIZE-1))
1408 m0 = _mm_loadu_ps(v);
1409 m1 = _mm_loadu_ps(v+4);
1410 m2 = _mm_loadu_ps(v+8);
1411 m3 = _mm_loadu_ps(v+12);
1415 m0 = _mm_load_ps(v);
1416 m1 = _mm_load_ps(v+4);
1417 m2 = _mm_load_ps(v+8);
1418 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of vectors, then recombine the low and high
// halves so that m0..m3 hold what were lanes 0..3 of the four inputs.
1422 __m128 t0, t1, t2, t3;
1423 t0 = _mm_unpacklo_ps(m0, m1);
1424 t1 = _mm_unpacklo_ps(m2, m3);
1425 t2 = _mm_unpackhi_ps(m0, m1);
1426 t3 = _mm_unpackhi_ps(m2, m3);
1427 m0 = _mm_movelh_ps(t0, t1);
1428 m1 = _mm_movehl_ps(t1, t0);
1429 m2 = _mm_movelh_ps(t2, t3);
1430 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores into the command payload and the global uniform array; both
// destinations must therefore be 16-byte aligned.
1432 _mm_store_ps(command->val, m0);
1433 _mm_store_ps(command->val+4, m1);
1434 _mm_store_ps(command->val+8, m2);
1435 _mm_store_ps(command->val+12, m3);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): carries a uniform index and one integer value.
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command to one thread's state by storing the value in
// thread->uniform1i[index].
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1447 thread->uniform1i[command->index] = command->val;
// Public entry point for setting an integer uniform: allocates a Uniform1i
// command, records the index, and mirrors the value in the global uniform1i[].
// NOTE(review): the store of i0 into command->val is not visible in this
// excerpt - confirm against the full file.
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1451 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452 command->index = index;
1455 dpsoftrast.uniform1i[command->index] = i0;
// Command 24 (ClipPlane): carries the four clip plane coefficients.
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command to one thread's state: copies the four floats
// into thread->clipplane and sets DPSOFTRAST_VALIDATE_FB in thread->validate.
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1461 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point for setting the clip plane: allocates a ClipPlane command
// and stores the coefficients in x, y, z, w order.
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1466 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467 command->clipplane[0] = x;
1468 command->clipplane[1] = y;
1469 command->clipplane[2] = z;
1470 command->clipplane[3] = w;
// Copies `size` four-float elements from a strided byte source into a packed
// destination.
//   dst     - destination; written with aligned stores, so it must be 16-byte
//             aligned
//   src     - first source element
//   size    - number of elements
//   stride  - byte distance between source elements
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt; presumably each path iterates until dst reaches `end`.
1474 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1476 float *end = dst + size*4;
// Aligned loads are only used when both the start address and the stride are
// multiples of ALIGN_SIZE.
1477 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1481 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1490 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements from a strided byte source into packed
// four-float elements whose fourth component is 1.0f.
//   dst     - destination; written with aligned stores, so it must be 16-byte
//             aligned
//   src     - first source element
//   size    - number of elements
//   stride  - byte distance between source elements
// The common idiom below rotates the vector so the unwanted lane sits in
// lane 0, overwrites lane 0 with 1.0f via _mm_move_ss, then rotates back.
// NOTE(review): the loop headers and most pointer increments are not visible
// in this excerpt. The strided paths load four floats per three-float
// element, so the load for the last element reads one float past it - confirm
// that callers' buffers tolerate this.
1497 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1499 float *end = dst + size*4;
// Tightly packed source: process four elements (twelve floats, three loads)
// per iteration.
1500 if (stride == sizeof(float[3]))
// end4 marks the end of the last complete group of four elements.
1502 float *end4 = dst + (size&~3)*4;
1503 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3 (unaligned loads).
1507 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: x0 y0 z0 1
1508 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1509 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: x1 (from v1) and y1 z1 (from v2)
1511 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1512 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: x2 y2 (from v2) and z2 (from v3)
1514 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1515 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1516 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: x3 y3 z3 already sit in lanes 1..3 of v3
1518 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521 src += 4*sizeof(float[3]);
// Same four-element expansion using aligned loads.
1528 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1529 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1530 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1533 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1536 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1537 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1542 src += 4*sizeof(float[3]);
// One element at a time; aligned loads require both the address and the
// stride to be multiples of ALIGN_SIZE.
1546 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1550 __m128 v = _mm_loadu_ps((const float *)src);
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1552 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1554 _mm_store_ps(dst, v);
1563 __m128 v = _mm_load_ps((const float *)src);
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1565 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1567 _mm_store_ps(dst, v);
// Expands `size` two-float elements from a strided byte source into packed
// four-float elements of the form (a, b, 0, 1).
//   dst     - destination; written with aligned stores, so it must be 16-byte
//             aligned
//   src     - first source element
//   size    - number of elements
//   stride  - byte distance between source elements
// NOTE(review): the loop headers and most pointer increments are not visible
// in this excerpt.
1574 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1576 float *end = dst + size*4;
// Constant supplying the (0, 1) tail of every output element.
1577 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed source: one four-float load yields two elements.
1578 if (stride == sizeof(float[2]))
// end2 marks the end of the last complete pair of elements.
1580 float *end2 = dst + (size&~1)*4;
1581 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = a0 b0 a1 b1; the shuffle keeps the low pair of v with the high pair of
// v2, and _mm_movehl_ps moves the high pair of v into the low half.
1585 __m128 v = _mm_loadu_ps((const float *)src);
1586 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589 src += 2*sizeof(float[2]);
// Same pairwise expansion using an aligned load.
1596 __m128 v = _mm_load_ps((const float *)src);
1597 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1600 src += 2*sizeof(float[2]);
// Single element: load two floats into the low half of the (0, 0, 0, 1)
// constant.
1606 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements from a strided byte source into packed
// four-float elements, scaling each byte by 1/255.
//   dst     - destination; written with aligned stores, so it must be 16-byte
//             aligned
//   src     - first source element
//   size    - number of elements
//   stride  - byte distance between source elements
// NOTE(review): the loop headers and most pointer increments are not visible
// in this excerpt.
1612 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1614 float *end = dst + size*4;
1615 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed source: sixteen bytes (four elements) per iteration.
1616 if (stride == sizeof(unsigned char[4]))
// end4 marks the end of the last complete group of four elements.
1618 float *end4 = dst + (size&~3)*4;
1619 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Zero-extend bytes to 16 bits (v1 = low eight bytes, v2 = high eight bytes),
// then to 32 bits, convert to float and scale.
1623 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1624 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1626 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1629 src += 4*sizeof(unsigned char[4]);
// Same four-element conversion using an aligned load.
1636 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1637 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1639 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1642 src += 4*sizeof(unsigned char[4]);
// Single element: the four bytes are read through an int pointer, then widened
// and scaled as above.
1648 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1649 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a packed four-float array with one repeated value.
//   dst   - destination; written with aligned stores, so it must be 16-byte
//           aligned
//   src   - the four floats to replicate; may be unaligned
//   size  - number of four-float elements
// NOTE(review): the loop header and dst increment are not visible in this
// excerpt; presumably the store repeats until dst reaches `end`.
1655 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1657 float *end = dst + 4*size;
1658 __m128 v = _mm_loadu_ps(src);
1661 _mm_store_ps(dst, v);
// Transforms `numitems` four-float vectors by a 4x4 matrix.
//   out4f        - destination array
//   in4f         - source array; may be the same pointer as out4f
//   numitems     - number of vectors
//   inmatrix16f  - sixteen floats; may be unaligned
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3, where m0..m3 are the four
// consecutive four-float groups of the matrix.
// NOTE(review): the loop headers, the store of each result and the return
// after the identity fast path are not visible in this excerpt.
1667 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1670 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1671 __m128 m0, m1, m2, m3;
// Bytewise comparison against the identity matrix.
1673 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1675 // fast case for identity matrix
1676 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1679 end = out4f + numitems*4;
1680 m0 = _mm_loadu_ps(inmatrix16f);
1681 m1 = _mm_loadu_ps(inmatrix16f + 4);
1682 m2 = _mm_loadu_ps(inmatrix16f + 8);
1683 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same loop body: unaligned loads when in4f is not a
// multiple of ALIGN_SIZE, aligned loads otherwise.
1684 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1688 __m128 v = _mm_loadu_ps(in4f);
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1693 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1702 __m128 v = _mm_load_ps(in4f);
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1705 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1707 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` four-float vectors from in4f to out4f with memcpy, so the
// two ranges must not overlap.
1716 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1718 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-projects one four-float vector `in` (x, y, z, w) into `out`.
// The vector is rotated to (w, x, y, z) and lane 0 is replaced by 1.0f, then
// every lane is computed as viewportcenter + viewportscale * lane / w, and the
// result is rotated back. Consequently:
//   - viewportcenter / viewportscale must be laid out in the rotated order
//     (lane 0 pairs with the 1/w term, lanes 1..3 with x, y, z);
//   - out lanes 0..2 hold the projected x, y, z and lane 3 holds
//     viewportcenter[0] + viewportscale[0] / w.
// `out` and `in` may be the same variable; `in` is copied into a local first.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection helper whose visible body is statement-for-statement the same as
// DPSOFTRAST_PROJECTVERTEX: rotate (x, y, z, w) to (w, x, y, z), replace
// lane 0 with 1.0f, compute viewportcenter + viewportscale * lane / w per
// lane, and rotate back into `out`.
// NOTE(review): despite the name, nothing in the visible lines restricts the
// computation to the Y component - confirm whether a distinct implementation
// was intended.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Transforms one four-float vector by the matrix held in m0..m3:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// Each _mm_shuffle_ps broadcasts one lane of p across the vector.
// NOTE(review): the declaration of p is not visible in this excerpt;
// presumably it is a local copy of `in`, which is what would make
// out == in safe - confirm.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space Y range for the axis-aligned box [minposf, maxposf]
// after transformation by inmatrix16f, writing it to *starty / *endy.
//   minposf, maxposf  - box corners, four floats each; loaded with aligned
//                       loads, so both must be 16-byte aligned
//   inmatrix16f       - sixteen floats; may be unaligned
// How it works:
//   - The four matrix vectors are loaded with lanes 0 and 1 swapped, so the
//     transformed Y lands in lane 0, where the scalar (_ss) operations act.
//   - BBFRONT(k, pos) transforms box corner k. clipdist[k] is z + w of the
//     result; when it is >= 0 the corner's bit is cleared from clipmask and
//     its y / w is folded into minproj / maxproj.
//   - BBCLIP(k) handles corners whose bit is still set: for each neighbouring
//     corner (k^1, k^2, k^4) whose bit is clear, it interpolates along that
//     edge to the point where clipdist reaches zero, projects that point and
//     folds it into minproj / maxproj.
// clipmask starts as 0xFF, one bit per box corner.
// NOTE(review): the BBFRONT invocations for corners 0 and 7, the BBCLIP
// #define line and the return statement are not visible in this excerpt;
// presumably clipmask is returned - confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj / maxproj start outside the [-2, 2] clamp range applied further
// down, so any projected value replaces them.
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// Swap lanes 0 and 1 of every matrix vector (Y output moves to lane 0).
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Move element 2 of the viewport center/scale into lane 0 (presumably the Y
// terms, given the lane order DPSOFTRAST_PROJECTVERTEX uses - confirm).
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// Clamp the projected range to [-2, 2] before mapping it to pixels.
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (presumably because the Y
// viewport scale is negative - confirm); both conversions truncate.
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` four-float positions to screen space without applying a
// matrix.
//   out4f      - receives an unmodified copy of each input position
//   screen4f   - receives each position after DPSOFTRAST_PROJECTVERTEX
//   starty/endy - receive the Y range computed by DPSOFTRAST_Vertex_BoundY
//   in4f       - input positions; may be unaligned
// Returns the result of DPSOFTRAST_Vertex_BoundY, called with the per-component
// min/max of the input positions and an identity matrix.
// NOTE(review): the loop headers and pointer increments are not visible in
// this excerpt. minpos/maxpos are seeded from in4f[0..3] before any loop test
// that may exist, so numitems is presumably >= 1 - confirm.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
// Two copies of the same loop body: unaligned loads when in4f is not a
// multiple of ALIGN_SIZE, aligned loads otherwise.
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned temporaries, as DPSOFTRAST_Vertex_BoundY reads
// them with aligned loads.
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` four-float positions by a 4x4 matrix and projects them
// to screen space.
//   out4f       - receives each transformed position
//   screen4f    - receives each transformed position after
//                 DPSOFTRAST_PROJECTVERTEX
//   starty/endy - receive the Y range computed by DPSOFTRAST_Vertex_BoundY
//   in4f        - input positions; may be unaligned
//   inmatrix16f - sixteen floats; may be unaligned
// Delegates to DPSOFTRAST_Vertex_Project when the matrix is bytewise equal to
// the identity. Otherwise returns the result of DPSOFTRAST_Vertex_BoundY,
// called with the min/max of the untransformed inputs and the matrix.
// NOTE(review): the declaration of `end`, the loop headers and the pointer
// increments are not visible in this excerpt. minpos/maxpos are seeded from
// in4f[0..3], so numitems is presumably >= 1 - confirm.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same loop body: unaligned loads when in4f is not a
// multiple of ALIGN_SIZE, aligned loads otherwise.
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
// The bounds are taken before the transform is applied.
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned temporaries, as DPSOFTRAST_Vertex_BoundY reads
// them with aligned loads.
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex array into the packed four-float working array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   outarray - index of the destination working array
//   inarray  - which client array to read (position, color, or a texture
//              coordinate array at DPSOFTRAST_ARRAY_TEXCOORD0 + n)
// NOTE(review): the switch header, the default/else labels, the component
// case labels and the return statement are not visible in this excerpt;
// presumably outf is returned - confirm.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
// Positions: three floats per vertex, expanded to four.
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colors: float array if set, else byte array if set, else the constant
// dpsoftrast.color replicated for every vertex.
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
// Same value as already assigned at the top of this case.
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texture coordinates: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0, and
// the loader is chosen by that unit's component count.
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by a 4x4 matrix.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; otherwise the existing contents of
// post_array4f[outarray] are used.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `data` is returned - confirm.
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without a matrix.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; otherwise the existing contents are used. The
// projected vertices go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty / drawendy, and the value returned by
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `data` is returned - confirm.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by a 4x4 matrix and projects it to
// screen space.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; otherwise the existing contents are used. The
// projected vertices go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty / drawendy, and the value returned by
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably `data` is returned - confirm.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for one span, for pixels startx..endx-1.
//   triangle->w[0], w[1], w[2] - x slope, y slope and constant term of w
//   span->x, span->y           - span origin used to evaluate w
//   zf                         - output array (the writes to it are not
//                                visible in this excerpt; presumably one
//                                value per pixel - confirm)
// NOTE(review): the declarations of x, z and dz, the assignment of z at the
// start of each sub-span, and both inner loop bodies are not visible here.
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
// w at the span origin; endz is the reciprocal of w at the first pixel.
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028 float endz = 1.0f / (w + wslope * startx);
// A zero x slope means w is constant along the span.
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
// Otherwise process the span in sub-spans of up to DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: the reciprocal is evaluated only at sub-span boundaries and stepped
// linearly (dz) in between.
2036 for (x = startx;x < endx;)
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// The final sub-span is clamped to the last pixel of the span.
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
// The x < nextsub test avoids dividing by zero for a one-pixel sub-span.
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// Writes a shaded span of BGRA8 pixels (in4ub) into dpsoftrast.fb_colorpixels[0],
// honoring the span's pixelmask and the thread's blend mode.
// Steps visible here: optional alpha-kill, masking of zero-alpha pixels for the
// alpha-weighted blend modes, then a scan over pixelmask that finds runs of enabled
// pixels and copies or blends each run.
// NOTE(review): this excerpt omits some lines of the function (braces, short
// statements), so the comments below describe only what the visible statements show.
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
// the same input and framebuffer memory viewed as bytes and as whole 32-bit pixels
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
2058 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer pointers to the span origin (row span->y, column span->x),
// so that x indexes in4ub, pixelmask and the framebuffer identically
2062 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063 pixeli += span->y * dpsoftrast.fb_width + span->x;
2064 // handle alphatest now (this affects depth writes too)
2065 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2066 for (x = startx;x < endx;x++)
// byte 3 of each pixel is its alpha; below 128 the pixel is dropped
2067 if (in4ub[x*4+3] < 128)
2068 pixelmask[x] = false;
2069 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070 // helps sprites, text and hud artwork
2071 switch(thread->fb_blendmode)
// blend modes that weight the source by its alpha: a pixel with alpha 0 contributes
// nothing, so it can be masked off
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// find the first pixel with non-zero alpha
2077 for (x = startx;x < endx;x++)
2079 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2084 while (++x < endx && in4ub[x*4+3] >= 1) ;
2086 if (x >= endx) break;
// clear the mask across the following run of zero-alpha pixels
2088 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089 if (x >= endx) break;
// remaining blend modes are listed together; no alpha-based masking is visible for them
2096 case DPSOFTRAST_BLENDMODE_OPAQUE:
2097 case DPSOFTRAST_BLENDMODE_ADD:
2098 case DPSOFTRAST_BLENDMODE_INVMOD:
2099 case DPSOFTRAST_BLENDMODE_MUL:
2100 case DPSOFTRAST_BLENDMODE_MUL2:
2101 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102 case DPSOFTRAST_BLENDMODE_INVADD:
2105 // put some special values at the end of the mask to ensure the loops end
// (a 1 stops the search for masked pixels, the following 0 stops the search for
// unmasked ones; this writes pixelmask[endx] and pixelmask[endx+1], so the mask
// buffer must have room for two entries past endx)
2106 pixelmask[endx] = 1;
2107 pixelmask[endx+1] = 0;
2108 // LordHavoc: use a double loop to identify subspans, this helps the
2109 // optimized copy/blend loops to perform at their best, most triangles
2110 // have only one run of pixels, and do the search using wide reads...
2114 // if this pixel is masked off, it's probably not alone...
2121 // the 4-item search must be aligned or else it stalls badly
2122 if ((x & 3) && !pixelmask[x])
// up to three single-byte checks bring x to a multiple of 4
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
2132 if(pixelmask[x]) goto endmasked;
// test four mask bytes per iteration while all four are zero
2137 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish byte by byte up to the first enabled pixel
2141 for (;!pixelmask[x];x++)
2143 // rather than continue the loop, just check the end variable
2148 // find length of subspan
2151 if (subx + 8 < endx)
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
2163 if(!pixelmask[subx]) goto endunmasked;
// four mask bytes per iteration while all four equal 1
2168 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2172 for (;pixelmask[subx];subx++)
2174 // the checks can overshoot, so make sure to clip it...
2178 // now that we know the subspan length... process!
2179 switch(thread->fb_blendmode)
// opaque: plain copy of the run [x, subx); a memcpy form and SSE forms are both
// visible here (NOTE(review): presumably selected by preprocessor guards that are
// not part of this excerpt)
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
2185 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels per iteration using four unaligned 128-bit copies
2190 while (x + 16 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2200 while (x + 4 <= subx)
2202 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2208 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1), defined right after the next case label, is the shared
// blend loop: while two pixels remain it loads 64 bits from ini and pixeli, widens the
// bytes to 16-bit lanes (src, dst), runs blend2, packs with unsigned saturation and
// stores; the one-pixel form (32-bit load/store) runs blend1. Both bodies compute the
// same formula; only the alpha broadcast differs (low and high halves vs low half).
// In the alpha-weighted modes `blend` is 16-bit lane 3 of each pixel, i.e. the source
// alpha, copied into every lane.
// ALPHA: dst += (src - dst) * alpha, scaled by 1/256 (both factors are shifted left
// by 4 and the high 16 bits of the product are kept)
2218 case DPSOFTRAST_BLENDMODE_ALPHA:
2219 #define FINISHBLEND(blend2, blend1) \
2220 for (;x + 1 < subx;x += 2) \
2223 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2226 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2231 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2234 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2245 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the pack step in FINISHBLEND saturates to 255)
2254 case DPSOFTRAST_BLENDMODE_ADD:
2255 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2257 case DPSOFTRAST_BLENDMODE_INVMOD:
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2264 case DPSOFTRAST_BLENDMODE_MUL:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2267 case DPSOFTRAST_BLENDMODE_MUL2:
2268 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2270 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2272 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2279 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src, scaled by 1/256
2288 case DPSOFTRAST_BLENDMODE_INVADD:
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Fetches one BGRA8 sample from mip level `mip` of `texture` at coordinates (x, y),
// where 1.0 spans the full width/height of that mip level (x and y are multiplied by
// width and height below). The result goes to c[0..3] in the texture's byte order.
// Linear filtering blends the 2x2 texel neighbourhood; otherwise the nearest texel
// is used. Coordinates are clamped when DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set
// and wrapped with a (size - 1) mask otherwise.
// NOTE(review): the mask-based wrap assumes power-of-two mip dimensions - confirm.
// Fix: in the linear path pixel[1] and pixel[3] now use tci1[0] (the x+1 column).
// They previously reused tci[0], so the filter only blended vertically even though
// lerp[1] and lerp[3] carry the horizontal weights.
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301 // warning: this is SLOW, only use if the optimized per-span functions won't do
2303 const unsigned char * RESTRICT pixelbase;
2304 const unsigned char * RESTRICT pixel[4];
2305 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2306 int wrapmask[2] = { width-1, height-1 };
// pixelbase is bytes + mipmap[mip][0] + mipmap[mip][1] minus one row (4*width bytes);
// rows are then addressed with a negative stride (x - y*width below).
// NOTE(review): presumably [0] is the mip's byte offset and [1] its byte size, making
// pixelbase the start of the mip's last stored row - confirm.
2307 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2308 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// texel coordinates with 12 fractional bits, shifted back by half a texel (2048)
2310 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights for (x,y), (x+1,y), (x,y+1), (x+1,y+1); they sum to 1<<24
2313 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2314 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2318 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2325 tci[0] &= wrapmask[0];
2326 tci[1] &= wrapmask[1];
2327 tci1[0] &= wrapmask[0];
2328 tci1[1] &= wrapmask[1];
// the four neighbours, in the same order as the lerp[] weights
2330 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2331 pixel[1] = pixelbase + 4 * (tci1[0] - tci[1]*width);
2332 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2333 pixel[3] = pixelbase + 4 * (tci1[0] - tci1[1]*width);
// weighted sum per channel; >>24 removes the 2x12 fractional bits of the weights
2334 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// nearest-texel path: integer texel indices, then clamp or wrap
2341 int tci[2] = { x * width, y * height };
2342 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2344 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2349 tci[0] &= wrapmask[0];
2350 tci[1] &= wrapmask[1];
2352 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
// Fills out4f[startx..endx) with float colors sampled from the 2D texture bound to
// unit `texunitindex`, using the interpolated attribute `arrayindex` as the texture
// coordinate and zf[] as the per-pixel perspective factor.
// Texture bytes are read as B,G,R,A and written out as R,G,B,A scaled to 0..1.
// Four inner loops cover {linear, nearest} x {clamp-to-edge, wrap}.
// NOTE(review): this excerpt omits some lines of the function (braces, declarations,
// short statements); comments describe only what the visible statements show.
2361 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2364 int startx = span->startx;
2365 int endx = span->endx;
2370 float tc[2], endtc[2];
2372 unsigned int tci[2];
2373 unsigned int tci1[2];
2374 unsigned int tcimin[2];
2375 unsigned int tcimax[2];
2380 const unsigned char * RESTRICT pixelbase;
2381 const unsigned char * RESTRICT pixel[4];
2382 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2383 // if no texture is bound, just fill it with white
2386 for (x = startx;x < endx;x++)
2388 out4f[x*4+0] = 1.0f;
2389 out4f[x*4+1] = 1.0f;
2390 out4f[x*4+2] = 1.0f;
2391 out4f[x*4+3] = 1.0f;
2395 mip = triangle->mip[texunitindex];
// base pointer one row (4*width bytes) before bytes + mipmap[mip][0] + mipmap[mip][1];
// rows are reached with the negative width stored in tciwidth below
2396 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2397 // if this mipmap of the texture is 1 pixel, just fill it with that color
// (mipmap[mip][1] == 4 is the test for that: four bytes, i.e. a single BGRA texel)
2398 if (texture->mipmap[mip][1] == 4)
2400 c[0] = texture->bytes[2] * (1.0f/255.0f);
2401 c[1] = texture->bytes[1] * (1.0f/255.0f);
2402 c[2] = texture->bytes[0] * (1.0f/255.0f);
2403 c[3] = texture->bytes[3] * (1.0f/255.0f);
2404 for (x = startx;x < endx;x++)
2406 out4f[x*4+0] = c[0];
2407 out4f[x*4+1] = c[1];
2408 out4f[x*4+2] = c[2];
2409 out4f[x*4+3] = c[3];
2413 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// NOTE(review): DPSOFTRAST_CALCATTRIB4F is defined elsewhere; presumably it fills
// data/slope with the attribute's value and per-x slope for this span - confirm
2414 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2415 flags = texture->flags;
// texture size in texels, used to scale the coordinate into texel units
2416 tcscale[0] = texture->mipmap[mip][2];
2417 tcscale[1] = texture->mipmap[mip][3];
// negative row stride in texels (pairs with the pixelbase computed above)
2418 tciwidth = -texture->mipmap[mip][2];
// clamp limits and wrap masks are both size - 1
// NOTE(review): the assignments to tcimin are not visible in this excerpt
2421 tcimax[0] = texture->mipmap[mip][2]-1;
2422 tcimax[1] = texture->mipmap[mip][3]-1;
2423 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2424 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel: attribute * zf * texture size
2425 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2426 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the exact
// coordinate is computed at each sub-span end and stepped linearly in between
2432 for (x = startx;x < endx;)
2434 unsigned int subtc[2];
2435 unsigned int substep[2];
// 4096 = 1<<12 fixed-point scale, divided by the sub-span length in pixels
2436 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2437 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2438 if (nextsub >= endx)
// the final sub-span ends on the last pixel; rescale the step to its real length
2440 nextsub = endsub = endx-1;
2441 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2445 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2446 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// per-pixel step and start coordinate, both with 12 fractional bits
2452 substep[0] = (endtc[0] - tc[0]) * subscale;
2453 substep[1] = (endtc[1] - tc[1]) * subscale;
2454 subtc[0] = tc[0] * (1<<12);
2455 subtc[1] = tc[1] * (1<<12);
// linear filter, clamp-to-edge
2458 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2460 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the fraction; the four bilinear weights sum to 1<<24
2462 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2464 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465 tci[0] = subtc[0]>>12;
2466 tci[1] = subtc[1]>>12;
2467 tci1[0] = tci[0] + 1;
2468 tci1[1] = tci[1] + 1;
// clamp both texel and its +1 neighbour into [tcimin, tcimax]
2469 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2470 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2471 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2472 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the 2x2 neighbourhood: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2473 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * (1<<24): removes the weight scale and maps 0..255 to 0..1
2477 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481 out4f[x*4+0] = c[0];
2482 out4f[x*4+1] = c[1];
2483 out4f[x*4+2] = c[2];
2484 out4f[x*4+3] = c[3];
// linear filter, wrapping: same as above but indices are masked instead of clamped
2489 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2491 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2492 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2493 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2494 tci[0] = subtc[0]>>12;
2495 tci[1] = subtc[1]>>12;
2496 tci1[0] = tci[0] + 1;
2497 tci1[1] = tci[1] + 1;
2498 tci[0] &= tciwrapmask[0];
2499 tci[1] &= tciwrapmask[1];
2500 tci1[0] &= tciwrapmask[0];
2501 tci1[1] &= tciwrapmask[1];
2502 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2503 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2504 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2505 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2506 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2507 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2508 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2509 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2510 out4f[x*4+0] = c[0];
2511 out4f[x*4+1] = c[1];
2512 out4f[x*4+2] = c[2];
2513 out4f[x*4+3] = c[3];
// nearest texel, clamp-to-edge
2517 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2519 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2521 tci[0] = subtc[0]>>12;
2522 tci[1] = subtc[1]>>12;
2523 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2524 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2525 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2526 c[0] = pixel[0][2] * (1.0f / 255.0f);
2527 c[1] = pixel[0][1] * (1.0f / 255.0f);
2528 c[2] = pixel[0][0] * (1.0f / 255.0f);
2529 c[3] = pixel[0][3] * (1.0f / 255.0f);
2530 out4f[x*4+0] = c[0];
2531 out4f[x*4+1] = c[1];
2532 out4f[x*4+2] = c[2];
2533 out4f[x*4+3] = c[3];
// nearest texel, wrapping
2538 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2540 tci[0] = subtc[0]>>12;
2541 tci[1] = subtc[1]>>12;
2542 tci[0] &= tciwrapmask[0];
2543 tci[1] &= tciwrapmask[1];
2544 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2545 c[0] = pixel[0][2] * (1.0f / 255.0f);
2546 c[1] = pixel[0][1] * (1.0f / 255.0f);
2547 c[2] = pixel[0][0] * (1.0f / 255.0f);
2548 c[3] = pixel[0][3] * (1.0f / 255.0f);
2549 out4f[x*4+0] = c[0];
2550 out4f[x*4+1] = c[1];
2551 out4f[x*4+2] = c[2];
2552 out4f[x*4+3] = c[3];
2559 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2563 int startx = span->startx;
2564 int endx = span->endx;
2566 __m128 data, slope, tcscale;
2567 __m128i tcsize, tcmask, tcoffset, tcmax;
2569 __m128i subtc, substep, endsubtc;
2572 int affine; // LordHavoc: optimized affine texturing case
2573 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2574 const unsigned char * RESTRICT pixelbase;
2575 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2576 // if no texture is bound, just fill it with white
2579 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2582 mip = triangle->mip[texunitindex];
2583 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2584 // if this mipmap of the texture is 1 pixel, just fill it with that color
2585 if (texture->mipmap[mip][1] == 4)
2587 unsigned int k = *((const unsigned int *)pixelbase);
2588 for (x = startx;x < endx;x++)
2592 affine = zf[startx] == zf[endx-1];
2593 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2594 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2595 flags = texture->flags;
2596 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2597 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2598 tcscale = _mm_cvtepi32_ps(tcsize);
2599 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2600 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2601 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2603 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2604 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2605 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2606 tcmax = _mm_packs_epi32(tcmask, tcmask);
2607 for (x = startx;x < endx;)
2609 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2610 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2611 if (nextsub >= endx || affine)
2613 nextsub = endsub = endx-1;
2614 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2618 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2620 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2621 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2622 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2623 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2624 substep = _mm_slli_epi32(substep, 1);
2627 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2628 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2630 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2631 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2633 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2634 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2635 tci = _mm_madd_epi16(tci, tcoffset);
2636 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2637 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2638 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2639 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2640 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2641 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2642 fracm = _mm_srli_epi16(subtc, 1);
2643 pix1 = _mm_add_epi16(pix1,
2644 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2645 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2646 pix3 = _mm_add_epi16(pix3,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2648 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2649 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2650 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2651 pix2 = _mm_add_epi16(pix2,
2652 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2653 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2654 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2658 const unsigned char * RESTRICT ptr1;
2659 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2660 tci = _mm_madd_epi16(tci, tcoffset);
2661 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2662 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2664 fracm = _mm_srli_epi16(subtc, 1);
2665 pix1 = _mm_add_epi16(pix1,
2666 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2667 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2668 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2669 pix1 = _mm_add_epi16(pix1,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2672 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2676 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2678 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2680 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2681 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2682 tci = _mm_madd_epi16(tci, tcoffset);
2683 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2684 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2685 _mm_setzero_si128());
2686 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2687 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2688 _mm_setzero_si128());
2689 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2690 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2691 tci = _mm_madd_epi16(tci, tcoffset);
2692 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2693 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2694 _mm_setzero_si128());
2695 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2696 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2697 _mm_setzero_si128());
2698 fracm = _mm_srli_epi16(subtc, 1);
2699 pix1 = _mm_add_epi16(pix1,
2700 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2701 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2702 pix3 = _mm_add_epi16(pix3,
2703 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2704 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2705 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2706 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2707 pix2 = _mm_add_epi16(pix2,
2708 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2709 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2710 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2714 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2715 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2716 tci = _mm_madd_epi16(tci, tcoffset);
2717 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2718 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2719 _mm_setzero_si128());
2720 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2721 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2722 _mm_setzero_si128());
2723 fracm = _mm_srli_epi16(subtc, 1);
2724 pix1 = _mm_add_epi16(pix1,
2725 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2727 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2728 pix1 = _mm_add_epi16(pix1,
2729 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2730 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2731 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2737 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2739 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2740 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2741 tci = _mm_madd_epi16(tci, tcoffset);
2742 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2743 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2744 _mm_setzero_si128());
2745 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2746 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2747 _mm_setzero_si128());
2748 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2749 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2750 tci = _mm_madd_epi16(tci, tcoffset);
2751 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2752 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2753 _mm_setzero_si128());
2754 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2755 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2756 _mm_setzero_si128());
2757 fracm = _mm_srli_epi16(subtc, 1);
2758 pix1 = _mm_add_epi16(pix1,
2759 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2760 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2761 pix3 = _mm_add_epi16(pix3,
2762 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2763 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2764 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2765 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2766 pix2 = _mm_add_epi16(pix2,
2767 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2768 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2769 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2773 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2774 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2775 tci = _mm_madd_epi16(tci, tcoffset);
2776 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2777 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2778 _mm_setzero_si128());
2779 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2780 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2781 _mm_setzero_si128());
2782 fracm = _mm_srli_epi16(subtc, 1);
2783 pix1 = _mm_add_epi16(pix1,
2784 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2786 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2787 pix1 = _mm_add_epi16(pix1,
2788 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2789 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2790 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2797 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2799 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2801 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2802 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2803 tci = _mm_madd_epi16(tci, tcoffset);
2804 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2805 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2809 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2810 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2811 tci = _mm_madd_epi16(tci, tcoffset);
2812 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2818 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2820 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2821 tci = _mm_and_si128(tci, tcmax);
2822 tci = _mm_madd_epi16(tci, tcoffset);
2823 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2824 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2828 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2829 tci = _mm_and_si128(tci, tcmax);
2830 tci = _mm_madd_epi16(tci, tcoffset);
2831 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler placeholder: the only visible statement fills the span's
// output pixels [startx, endx) with 0xFF bytes (opaque white); triangle, texunitindex,
// arrayindex and zf are not used by it.
// Fix: the byte count is (endx - startx) * 4. It was (startx - endx) * 4, which is
// negative for any non-empty span and turns into a huge size_t inside memset,
// writing far past the end of out4ub.
2840 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2843 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Shadow-map sampling helper: takes a read-only float vector and returns a float.
// NOTE(review): the body is not visible in this excerpt, so the meaning of `vector`
// and of the returned value cannot be stated from here - confirm against the full file.
2846 static float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each input pixel in4f[x] by the interpolated 4-component attribute
// `arrayindex` and stores the product in out4f[x], for x in [startx, endx).
// NOTE(review): the assignment of `z` is not visible in this excerpt - presumably
// z = zf[x], the per-pixel perspective factor; confirm against the full file.
2853 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2856 int startx = span->startx;
2857 int endx = span->endx;
// NOTE(review): DPSOFTRAST_CALCATTRIB4F is defined elsewhere; presumably it fills
// data/slope with the attribute's value and per-x slope for this span - confirm
2862 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2863 for (x = startx;x < endx;x++)
// attribute evaluated linearly in x, then scaled by z
2866 c[0] = (data[0] + slope[0]*x) * z;
2867 c[1] = (data[1] + slope[1]*x) * z;
2868 c[2] = (data[2] + slope[2]*x) * z;
2869 c[3] = (data[3] + slope[3]*x) * z;
2870 out4f[x*4+0] = in4f[x*4+0] * c[0];
2871 out4f[x*4+1] = in4f[x*4+1] * c[1];
2872 out4f[x*4+2] = in4f[x*4+2] * c[2];
2873 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated 4-component attribute `arrayindex` into out4f[x] for
// x in [startx, endx).
// NOTE(review): the assignment of `z` is not visible in this excerpt - presumably
// z = zf[x], the per-pixel perspective factor; confirm against the full file.
2879 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2882 int startx = span->startx;
2883 int endx = span->endx;
// NOTE(review): DPSOFTRAST_CALCATTRIB4F is defined elsewhere; presumably it fills
// data/slope with the attribute's value and per-x slope for this span - confirm
2888 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2889 for (x = startx;x < endx;x++)
// attribute evaluated linearly in x, then scaled by z
2892 c[0] = (data[0] + slope[0]*x) * z;
2893 c[1] = (data[1] + slope[1]*x) * z;
2894 c[2] = (data[2] + slope[2]*x) * z;
2895 c[3] = (data[3] + slope[3]*x) * z;
2896 out4f[x*4+0] = c[0];
2897 out4f[x*4+1] = c[1];
2898 out4f[x*4+2] = c[2];
2899 out4f[x*4+3] = c[3];
2905 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2907 int x, startx = span->startx, endx = span->endx;
2908 float c[4], localcolor[4];
2909 localcolor[0] = subcolor[0];
2910 localcolor[1] = subcolor[1];
2911 localcolor[2] = subcolor[2];
2912 localcolor[3] = subcolor[3];
2913 for (x = startx;x < endx;x++)
2915 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2916 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2917 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2918 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2919 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2920 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2921 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2922 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2928 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2930 int x, startx = span->startx, endx = span->endx;
2931 for (x = startx;x < endx;x++)
2933 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2942 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2944 int x, startx = span->startx, endx = span->endx;
2945 for (x = startx;x < endx;x++)
2947 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2948 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2949 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2950 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float4 spans: out4f[x] = ina4f[x] * a + inb4f[x] * b for each
// pixel x in [span->startx, span->endx), where a = 1 - (alpha of inb4f[x]).
// NOTE(review): b is presumably the complementary weight (inb's alpha) — confirm.
// triangle is not used in the visible code.
2956 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2958 int x, startx = span->startx, endx = span->endx;
2960 for (x = startx;x < endx;x++)
// weight of the first buffer is one minus the second buffer's 4th component
2962 a = 1.0f - inb4f[x*4+3];
// the same two weights are applied to all four components, alpha included
2964 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2965 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2966 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2967 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2973 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2975 int x, startx = span->startx, endx = span->endx;
2976 float localcolor[4], ilerp, lerp;
2977 localcolor[0] = color[0];
2978 localcolor[1] = color[1];
2979 localcolor[2] = color[2];
2980 localcolor[3] = color[3];
2981 ilerp = 1.0f - localcolor[3];
2982 lerp = localcolor[3];
2983 for (x = startx;x < endx;x++)
2985 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2986 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2987 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2988 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span op (SSE2): multiplies the input BGRA8 span in4ub by an interpolated
// vertex attribute and writes the result to out4ub.
// The attribute (array `arrayindex`) is evaluated as (data + slope*x) * zf[x] only
// at sub-span boundaries and stepped in 8.8 fixed point in between.
2995 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2999 int startx = span->startx;
3000 int endx = span->endx;
3003 __m128i submod, substep, endsubmod;
3004 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute matches the BGRA byte order of the pixels
3005 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3006 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also converted to integer with a scale of 256
3007 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3008 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the body, one sub-span of up to DPSOFTRAST_DRAW_MAXSUBSPAN pixels per pass
3009 for (x = startx; x < endx;)
3011 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3012 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last pixel and rescale the step to its length
3013 if (nextsub >= endx)
3015 nextsub = endsub = endx-1;
3016 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute value at the end of this sub-span and the fixed-point step towards it
3020 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3021 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3022 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
3023 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3024 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod covers pixels x and x+1 but is advanced by a single substep per
// two-pixel iteration — confirm whether the step should be doubled here.
3025 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// place each input byte in the high byte of a 16-bit lane (value << 8)
3027 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
// high 16 bits of (value << 8) * modulator, i.e. value * modulator / 256
3028 pix = _mm_mulhi_epu16(pix, submod);
// saturate to bytes and store the two pixels
3029 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3033 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3034 pix = _mm_mulhi_epu16(pix, submod);
3035 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): writes an interpolated vertex attribute to out4ub as BGRA8.
// The attribute (array `arrayindex`) is evaluated as (data + slope*x) * zf[x] only
// at sub-span boundaries and stepped in fixed point (scale 4095) in between; the
// fixed-point value is shifted right by 4 and saturated to a byte on output.
3042 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3046 int startx = span->startx;
3047 int endx = span->endx;
3050 __m128i submod, substep, endsubmod;
3051 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute matches the BGRA byte order of the output
3052 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3053 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also converted to integer with a scale of 4095
3054 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3055 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the body, one sub-span of up to DPSOFTRAST_DRAW_MAXSUBSPAN pixels per pass
3056 for (x = startx; x < endx;)
3058 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3059 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last pixel and rescale the step to its length
3060 if (nextsub >= endx)
3062 nextsub = endsub = endx-1;
3063 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute value at the end of this sub-span and the fixed-point step towards it
3067 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3068 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3069 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
3070 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3071 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod covers pixels x and x+1 but is advanced by a single substep per
// two-pixel iteration — confirm whether the step should be doubled here.
3072 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to bytes and store two pixels
3074 __m128i pix = _mm_srai_epi16(submod, 4);
3075 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3079 __m128i pix = _mm_srai_epi16(submod, 4);
3080 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine (SSE2): out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0))
// per byte, over the span starting at span->startx up to span->endx.
// subcolor is read as four floats and its components 0 and 2 are swapped before use.
3087 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3090 int x, startx = span->startx, endx = span->endx;
// scale the subtract colour to 0..255, swap lanes 0 and 2, then pack to 16-bit lanes (duplicated for two pixels)
3091 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3092 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3093 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs from bytes to 16-bit lanes
3095 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3096 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero before the add
3097 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
// pack back to bytes with unsigned saturation
3098 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3102 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3105 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit product of two BGRA8 spans (SSE2): out4ub = ina4ub * inb4ub / 256 per byte,
// over the span starting at span->startx up to span->endx.
3110 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3113 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3114 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended (value); inb bytes go into the high byte of each lane (value << 8)
3116 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) == a * b / 256
3118 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3123 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3125 pix1 = _mm_mulhi_epu16(pix1, pix2);
3126 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit saturating sum of two BGRA8 spans (SSE2): out4ub = min(ina4ub + inb4ub, 255)
// per byte, over the span starting at span->startx up to span->endx.
3131 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3134 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3135 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the add cannot wrap
3137 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3139 pix1 = _mm_add_epi16(pix1, pix2);
// the pack saturates sums above 255
3140 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3144 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3146 pix1 = _mm_add_epi16(pix1, pix2);
3147 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit tinted add (SSE2): out4ub = saturate(ina4ub + inb4ub * tint) per byte, over
// the span starting at span->startx up to span->endx.
// inbtintbgra is read as four floats and used in the order given (no lane swap);
// it is converted to fixed point with a scale of 256.
3153 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3156 int x, startx = span->startx, endx = span->endx;
// tint in fixed point (x256), packed to 16-bit lanes and duplicated for two pixels
3157 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3158 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3159 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended; inb bytes go into the high byte of each lane (value << 8)
3161 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3162 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (inb << 8) == inb * tint / 256, added onto ina
3163 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3164 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3168 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3169 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3170 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3171 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit alpha blend of two BGRA8 spans (SSE2):
//   out4ub = ina4ub + (inb4ub - ina4ub) * alpha(inb4ub) / 256   per byte,
// over the span starting at span->startx up to span->endx.  The blend factor is
// the 4th byte of each inb4ub pixel.
3177 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3180 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3181 for (x = startx;x+2 <= endx;x+=2)
3183 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3184 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th lane (alpha of inb) across that pixel's four lanes
3185 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply yields (diff * alpha) >> 8
3186 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3187 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3191 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3192 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3193 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3194 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3195 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend toward a constant colour (SSE2):
//   out4ub = in4ub + (color*255 - in4ub) * (color alpha * 255) / 256   per byte,
// over the span starting at span->startx up to span->endx.
// color is read as four floats and its components 0 and 2 are swapped before use.
3200 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3203 int x, startx = span->startx, endx = span->endx;
// scale the colour to 0..255, swap lanes 0 and 2, then pack to 16-bit lanes (duplicated for two pixels)
3204 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3205 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend factor: the colour's alpha lane broadcast to every lane, pre-shifted left by 4
3206 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3207 for (x = startx;x+2 <= endx;x+=2)
3209 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 too, so the signed high-half multiply yields (diff * alpha) >> 8
3210 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3211 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3215 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3216 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3217 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3224 static void DPSOFTRAST_VertexShader_Generic(void)
3226 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3228 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3229 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3230 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3233 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3235 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3242 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3243 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3244 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3246 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3247 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3250 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3252 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3255 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3257 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3260 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3265 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3266 if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3269 for (x = span->startx;x < span->endx;x++)
3270 buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3272 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3277 static void DPSOFTRAST_VertexShader_PostProcess(void)
3279 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3280 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3281 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3284 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3286 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3287 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3288 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3289 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3291 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3292 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3294 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3295 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3297 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3298 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3300 // TODO: implement saturation
3302 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3304 // TODO: implement gammaramps
3306 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3311 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3313 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3316 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3318 // this is never called (because colormask is off when this shader is used)
3319 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3320 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3321 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3322 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3323 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3328 static void DPSOFTRAST_VertexShader_FlatColor(void)
3330 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3331 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader (SSE2).
// Samples the GL20TU_COLOR texture (attribute array 2) into a span buffer and
// multiplies each pixel by a constant built from the Color_Ambient uniform, with
// the alpha lane taken from the Alpha uniform instead.
// Output goes straight to the colour framebuffer unless ALPHAKILL is set or the
// blend mode is not opaque; in that case it goes to a local buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3334 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3337 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer address corresponding to (span->x, span->y), 4 bytes per pixel
3338 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3339 int x, startx = span->startx, endx = span->endx;
3340 __m128i Color_Ambientm;
3341 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3342 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3343 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3344 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3345 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha-kill or blending needs the finish pass, so render into the local buffer instead
3346 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3347 pixel = buffer_FragColorbgra8;
// ambient colour in fixed point (x256) with lanes 0 and 2 swapped
3348 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear the alpha lane and replace it with Alpha * 255
3349 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3350 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// pack to 16-bit lanes, duplicated so one register covers two pixels
3351 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3352 for (x = startx;x < endx;x++)
// fast path: four pixels at once when four consecutive pixelmask bytes are all 1
3355 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3358 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texel bytes sit in the high byte of each lane, so mulhi gives constant * texel / 256
3359 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3360 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3361 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3367 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3368 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3369 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case needs the finish pass
3371 if (pixel == buffer_FragColorbgra8)
3372 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3378 static void DPSOFTRAST_VertexShader_VertexColor(void)
3380 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3381 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3382 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader (SSE2).
// Per pixel: texel(GL20TU_COLOR) * (Color_Diffuse * interpolated vertex colour + Color_Ambient),
// with the alpha lane of the ambient term taken from the Alpha uniform and the
// alpha lane of the diffuse term cleared.
// Output goes straight to the colour framebuffer unless ALPHAKILL is set or the
// blend mode is not opaque; in that case it goes to a local buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3385 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3388 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer address corresponding to (span->x, span->y), 4 bytes per pixel
3389 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3390 int x, startx = span->startx, endx = span->endx;
3391 __m128i Color_Ambientm, Color_Diffusem;
3393 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3394 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3396 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3397 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3398 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha-kill or blending needs the finish pass, so render into the local buffer instead
3399 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3400 pixel = buffer_FragColorbgra8;
// ambient colour in fixed point (x256), lanes 0 and 2 swapped, alpha lane replaced by Alpha * 255
3401 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3404 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour in fixed point (x4096), lanes 0 and 2 swapped, alpha lane cleared
3405 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3406 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3407 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolation setup for the vertex colour: swap lanes 0 and 2, move to the first pixel, scale by 4096
3408 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3409 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3410 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3411 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3412 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3413 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration (plus three more times inside the 4-pixel path)
3414 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3416 __m128i color, mod, pix;
// fast path: four pixels at once when four consecutive pixelmask bytes are all 1
3417 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3420 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3421 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel modulator = interpolated colour * that pixel's z; mod holds pixels 0-1, mod2 pixels 2-3
3422 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3423 data = _mm_add_ps(data, slope);
3424 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3425 data = _mm_add_ps(data, slope);
3426 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3427 data = _mm_add_ps(data, slope);
3428 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * modulator + ambient) * texel, using high-half multiplies; texel bytes sit in the high byte of each lane
3429 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3430 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3431 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3432 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3433 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same formula for one texel
3439 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3440 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3441 mod = _mm_packs_epi32(mod, mod);
3442 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3443 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case needs the finish pass
3445 if (pixel == buffer_FragColorbgra8)
3446 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3452 static void DPSOFTRAST_VertexShader_Lightmap(void)
3454 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3455 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3456 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader (SSE2).
// Per pixel: texel(GL20TU_COLOR) * (Color_Diffuse * texel(GL20TU_LIGHTMAP) + Color_Ambient),
// plus Color_Glow * texel(GL20TU_GLOW) when SHADERPERMUTATION_GLOW is set.
// The alpha lane of the ambient term comes from the Alpha uniform; the alpha
// lanes of the diffuse and glow terms are cleared.
// Output goes straight to the colour framebuffer unless ALPHAKILL is set or the
// blend mode is not opaque; in that case it goes to a local buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3459 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3462 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer address corresponding to (span->x, span->y), 4 bytes per pixel
3463 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3464 int x, startx = span->startx, endx = span->endx;
3465 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3466 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3467 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3468 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3469 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3470 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3471 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3472 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3473 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha-kill or blending needs the finish pass, so render into the local buffer instead
3474 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3475 pixel = buffer_FragColorbgra8;
// ambient colour in fixed point (x256), lanes 0 and 2 swapped, alpha lane replaced by Alpha * 255
3476 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3477 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3478 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3479 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour in fixed point (x256), lanes 0 and 2 swapped, alpha lane cleared
3480 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3481 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant: a third texture is sampled and added on top of the lit colour
3483 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3485 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow colour in fixed point (x256), lanes 0 and 2 swapped, alpha lane cleared
3486 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3487 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3488 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3489 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3490 for (x = startx;x < endx;x++)
3492 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when four consecutive pixelmask bytes are all 1
3493 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3496 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3497 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3498 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * colour + glowcolour * glow; pix covers pixels 0-1, pix2 pixels 2-3
3499 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3500 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3501 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3502 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3503 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3504 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3505 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour is computed in the low 64 bits and the glow term in the high 64 bits
3511 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3512 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3513 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3514 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the high half (glow) onto the low half (lit colour)
3515 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3516 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3521 for (x = startx;x < endx;x++)
3523 __m128i color, lightmap, pix;
// fast path: four pixels at once when four consecutive pixelmask bytes are all 1
3524 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3527 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3528 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * colour; pix covers pixels 0-1, pix2 pixels 2-3
3529 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3530 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3531 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3532 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3533 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3539 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3540 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3541 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3542 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case needs the finish pass
3545 if (pixel == buffer_FragColorbgra8)
3546 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrapper shaders below
// call these two functions before their definitions appear later in the file.
3551 void DPSOFTRAST_VertexShader_LightDirection(void);
3552 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader entry point for the fake-light mode.
// It has no work of its own: it runs the shared LightDirection vertex shader.
3554 static void DPSOFTRAST_VertexShader_FakeLight(void)
3556 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader entry point for the fake-light mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection pixel shader,
// which picks its fake-light branch by testing thread->shader_mode == SHADERMODE_FAKELIGHT.
3559 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3561 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space light-direction-map mode.
// Runs the shared LightDirection vertex shader, then additionally loads TEXCOORD4,
// which the LightDirection pixel shader uses as the coordinate set for the lightmap
// and deluxemap lookups.
3566 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3568 DPSOFTRAST_VertexShader_LightDirection();
// NOTE(review): DPSOFTRAST_Array_Load is defined elsewhere; presumably it copies the
// input array into the post-transform array of the same index - confirm.
3569 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space light-direction-map mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection pixel shader,
// which picks its model-space branch by testing
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3572 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3574 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space light-direction-map mode.
// Runs the shared LightDirection vertex shader, then additionally loads TEXCOORD4,
// which the LightDirection pixel shader uses as the coordinate set for the lightmap
// and deluxemap lookups.
3579 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3581 DPSOFTRAST_VertexShader_LightDirection();
// NOTE(review): DPSOFTRAST_Array_Load is defined elsewhere; presumably it copies the
// input array into the post-transform array of the same index - confirm.
3582 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space light-direction-map mode.
// Forwards thread, triangle and span unchanged to the shared LightDirection pixel shader,
// which picks its tangent-space branch by testing
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3585 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3587 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the light-direction family of shader modes.
// For each of dpsoftrast.numvertices vertices it writes two extra attributes:
//   TEXCOORD5 = LightDir uniform expressed in the per-vertex (svector, tvector, normal) basis
//   TEXCOORD6 = (EyePosition - position) expressed in the same basis
// and finally transforms/projects the positions with the model-view-projection matrix.
// Takes no parameters and returns nothing; all inputs and outputs live in the global
// dpsoftrast state.
// NOTE(review): the declarations of i, LightDir, EyeVector, position, svector, tvector and
// normal, and the braces, are not visible in this listing.
3592 void DPSOFTRAST_VertexShader_LightDirection(void)
3595 int numvertices = dpsoftrast.numvertices;
3597 float LightVector[4];
3598 float EyePosition[4];
3599 float EyeVectorModelSpace[4];
// Fetch the light direction and eye position uniforms. Element [3] of each is loaded
// but not read by any line visible below.
3605 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3606 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3607 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3608 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3609 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3610 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3611 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3612 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex inputs: position, texcoord0 (passed through the TexMatrix
// uniform) and texcoord1..3, which are read below as svector, tvector and normal.
// NOTE(review): Array_Load / Array_Transform are defined elsewhere; presumably they fill
// dpsoftrast.post_array4f[...] - confirm.
3613 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3614 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3615 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3616 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3617 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3618 for (i = 0;i < numvertices;i++)
// Read this vertex's position and its three basis vectors (xyz only).
3620 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3621 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3622 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3623 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3624 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3625 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3626 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3627 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3628 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3629 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3630 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3631 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: dot products of LightDir with svector, tvector and normal.
3632 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3633 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3634 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// Stored as TEXCOORD5 with w = 0; the pixel shader interpolates it from that slot.
3635 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3636 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3637 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3638 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, projected onto the same three basis vectors.
3639 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3640 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3641 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3642 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3643 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3644 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Stored as TEXCOORD6 with w = 0; the pixel shader interpolates it from that slot.
3645 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3646 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3647 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3648 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Done last because the loop above reads the untransformed positions.
// NOTE(review): the meaning of the -1 argument is defined by DPSOFTRAST_Array_TransformProject,
// which is not visible here - confirm.
3650 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar minimum / maximum. Each argument is expanded twice, so pass only
// expressions without side effects.
3653 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3654 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component dot product of a and b (elements [0], [1] and [2] only).
3655 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length of v: the dot product of v with itself.
3656 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length of v: square root of the squared length.
3657 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement-style macro used by the shaders to normalize a 3-vector.
// Only its first body line (len = length of v) is visible in this listing.
// NOTE(review): the remaining continuation lines presumably scale v[0..2] by 1/len when
// len is non-zero - confirm against the full macro.
3658 #define DPSOFTRAST_Vector3Normalize(v)\
3661 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader behind the FakeLight and LightDirectionMap (model/tangent space)
// wrappers, plus a fallback mode that uses the interpolated light vector.
// The mode is read from thread->shader_mode.
//   thread   - per-thread state; supplies uniform4f, shader_permutation, shader_mode and
//              shader_exactspecularmath.
//   triangle - triangle whose vertex attributes DPSOFTRAST_CALCATTRIB4F interpolates.
//   span     - pixel range to shade; x runs over [span->startx, span->endx).
// Three code paths are chosen by thread->shader_permutation:
//   SPECULAR - ambient + diffuse + specular lighting (plus optional glow),
//   DIFFUSE  - ambient + diffuse lighting (plus optional glow),
//   neither  - ambient only (plus optional glow).
// Each path writes BGRA bytes into buffer_FragColorbgra8, which is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// NOTE(review): braces, else lines and a few short declarations/assignments (z, glosstex,
// eyenormal, diffuse, specular, d, f) are not visible in this listing.
3672 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, indexed directly by x.
// NOTE(review): assumes span->endx <= DPSOFTRAST_DRAW_MAXSPANLENGTH - confirm.
3674 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3675 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3680 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3681 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3682 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3683 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3684 int x, startx = span->startx, endx = span->endx;
3685 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolation base ("data") and per-pixel step ("slope") for each varying attribute.
3686 float LightVectordata[4];
3687 float LightVectorslope[4];
3688 float EyeVectordata[4];
3689 float EyeVectorslope[4];
3690 float VectorSdata[4];
3691 float VectorSslope[4];
3692 float VectorTdata[4];
3693 float VectorTslope[4];
3694 float VectorRdata[4];
3695 float VectorRslope[4];
3697 float diffusetex[4];
3699 float surfacenormal[4];
3700 float lightnormal[4];
3701 float lightnormal_modelspace[4];
3703 float specularnormal[4];
3706 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped so that index 0..2 lines up
// with the byte order of the bgra8 texture buffers they are multiplied with below.
3708 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3709 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3710 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3711 Color_Glow[3] = 0.0f;
3712 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3713 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3714 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component carries the Alpha uniform; it scales the output alpha.
3715 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3716 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3717 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3718 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3719 Color_Pants[3] = 0.0f;
3720 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3721 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3722 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3723 Color_Shirt[3] = 0.0f;
// NOTE(review): presumably fills buffer_z for the span - confirm in DPSOFTRAST_Draw_Span_Begin.
3724 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Sample the base colour texture for every pixel of the span (texcoord set 0).
3725 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Pants/shirt layers are sampled whenever COLORMAPPING is set, but among the visible
// lines only the SPECULAR loop below mixes them into the diffuse colour.
3726 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3728 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3731 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3733 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
3735 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3737 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3738 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3739 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3740 Color_Diffuse[3] = 0.0f;
// Default light colour from the uniform; the lightmap and fake-light modes overwrite
// elements 0..2 per pixel inside the loop.
3741 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3742 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3743 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3744 LightColor[3] = 0.0f;
3745 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3746 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3747 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3748 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3749 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the gloss alpha byte (glosstex[3]).
3750 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every mode on this path for the specular term.
3751 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3752 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup: the model-space mode needs the interpolated S/T/R vectors plus
// lightmap and deluxemap samples (texcoord set 4).
3754 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3756 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3757 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3758 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3759 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// The tangent-space mode only needs the lightmap and deluxemap samples.
3762 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3764 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3765 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3767 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3769 // nothing of this needed
// NOTE(review): presumably the final else branch (the else line is not visible):
// the fallback mode interpolates the light vector from TEXCOORD5.
3773 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3776 for (x = startx;x < endx;x++)
// Diffuse colour starts as the raw texture bytes (0..255 range, BGRA order).
3779 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3780 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3781 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3782 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3783 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the tinted pants and shirt layers; alpha is unaffected because element [3] of both tints is 0.
3785 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3786 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3787 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3788 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3790 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3791 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3792 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3793 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1 maps 0..255 to roughly -1..1. The byte index
// is reversed (x from byte 2, z from byte 0) because the buffer is BGRA.
3794 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3795 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3796 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3797 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Per-mode light direction (lightnormal) and light colour for this pixel.
3799 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3801 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3802 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3803 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3804 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3806 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3807 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3808 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3809 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3811 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3812 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3813 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3814 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3816 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3817 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3818 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3819 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3821 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3822 DPSOFTRAST_Vector3Normalize(lightnormal);
3824 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes scaled by 1/256 and divided by lightnormal z, with z floored at 0.25
// so the divisor cannot get arbitrarily small.
3826 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3827 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3828 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3829 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3832 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// The deluxemap texel is used directly as the light direction; no normalize call is
// visible on this branch.
3834 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3835 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3836 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3838 float f = 1.0f / 256.0f;
3839 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3840 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3841 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3844 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the light direction is the normalized eye vector and the light colour is white.
// NOTE(review): z is assigned on a line not visible here; presumably the per-pixel value
// taken from buffer_z - confirm.
3846 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3847 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3848 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3849 DPSOFTRAST_Vector3Normalize(lightnormal);
3851 LightColor[0] = 1.0;
3852 LightColor[1] = 1.0;
3853 LightColor[2] = 1.0;
// NOTE(review): presumably the final else branch (the else line is not visible): light
// direction from the interpolated TEXCOORD5 vector; LightColor keeps the uniform value.
3857 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3858 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3859 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3860 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse factor: N dot L, clamped at 0 from below.
3863 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular factor: reflection-vector form when shader_exactspecularmath is set; the lines
// after it use the half-vector form (presumably the else branch).
3865 if(thread->shader_exactspecularmath)
3867 // reflect lightnormal at surfacenormal, take the negative of that
3868 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3870 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3871 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3872 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3873 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3875 // dot of this and normalize(EyeVectorFogDepth.xyz)
3876 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3877 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3878 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3879 DPSOFTRAST_Vector3Normalize(eyenormal);
3881 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Half-vector form: specularnormal = normalize(lightnormal + eyenormal), then N dot H.
3885 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3886 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3887 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3888 DPSOFTRAST_Vector3Normalize(eyenormal);
3890 specularnormal[0] = lightnormal[0] + eyenormal[0];
3891 specularnormal[1] = lightnormal[1] + eyenormal[1];
3892 specularnormal[2] = lightnormal[2] + eyenormal[2];
3893 DPSOFTRAST_Vector3Normalize(specularnormal);
3895 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha byte (SpecularPower already holds the 1/255 factor).
3897 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine the terms per channel. Results are clamped only at the upper bound (255).
3899 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3901 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3902 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3903 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3904 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same combination without the glow term (presumably the else of the GLOW test).
3908 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3909 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3910 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3911 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3914 buffer_FragColorbgra8[x*4+0] = d[0];
3915 buffer_FragColorbgra8[x*4+1] = d[1];
3916 buffer_FragColorbgra8[x*4+2] = d[2];
3917 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
3920 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3922 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3923 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3924 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3925 Color_Diffuse[3] = 0.0f;
3926 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3927 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3928 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3929 LightColor[3] = 0.0f;
3930 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup, as on the specular path, except that the eye vector is only
// interpolated for the fake-light mode here.
3932 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3934 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3935 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3936 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3937 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3940 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3942 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3943 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3945 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3947 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// NOTE(review): presumably the final else branch (the else line is not visible).
3951 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3954 for (x = startx;x < endx;x++)
// No pants/shirt mixing is visible in this loop: diffusetex is the plain colour texel.
3957 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3958 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3959 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3960 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal map texel (byte/128 - 1, reversed byte index for BGRA).
3961 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3962 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3963 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3964 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Per-mode light direction and colour; same computations as on the specular path.
3966 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3968 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3969 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3970 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3971 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3973 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3974 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3975 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3976 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3978 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3979 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3980 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3981 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3983 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3984 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3985 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3986 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3988 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3989 DPSOFTRAST_Vector3Normalize(lightnormal);
3991 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes scaled by 1/256 and divided by lightnormal z, with z floored at 0.25.
3993 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3994 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3995 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3996 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3999 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4001 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4002 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4003 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4005 float f = 1.0f / 256.0f;
4006 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4007 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4008 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4011 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// NOTE(review): z is assigned on a line not visible here; presumably from buffer_z - confirm.
4013 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4014 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4015 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4016 DPSOFTRAST_Vector3Normalize(lightnormal);
4018 LightColor[0] = 1.0;
4019 LightColor[1] = 1.0;
4020 LightColor[2] = 1.0;
// NOTE(review): presumably the final else branch (the else line is not visible).
4024 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4025 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4026 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4027 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse factor: N dot L, clamped at 0 from below.
4030 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Output = glow (optional) + texel * (ambient + diffuse * light), clamped at 255 from above.
4031 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4033 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4034 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4035 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4036 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// Same without the glow term (presumably the else of the GLOW test).
4040 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4041 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4042 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4043 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4045 buffer_FragColorbgra8[x*4+0] = d[0];
4046 buffer_FragColorbgra8[x*4+1] = d[1];
4047 buffer_FragColorbgra8[x*4+2] = d[2];
4048 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: neither SPECULAR nor DIFFUSE (presumably the final else): ambient only ----
4053 for (x = startx;x < endx;x++)
4056 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4057 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4058 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4059 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Output = glow (optional) + texel * ambient, clamped at 255 from above.
4061 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4063 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4064 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4065 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4066 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4070 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4071 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4072 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4073 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4075 buffer_FragColorbgra8[x*4+0] = d[0];
4076 buffer_FragColorbgra8[x*4+1] = d[1];
4077 buffer_FragColorbgra8[x*4+2] = d[2];
4078 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing stage.
4081 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4086 static void DPSOFTRAST_VertexShader_LightSource(void)
4089 int numvertices = dpsoftrast.numvertices;
4090 float LightPosition[4];
4091 float LightVector[4];
4092 float LightVectorModelSpace[4];
4093 float EyePosition[4];
4094 float EyeVectorModelSpace[4];
4100 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4101 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4102 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4103 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4104 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4105 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4106 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4107 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4108 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4109 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4110 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4111 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4112 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4113 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4114 for (i = 0;i < numvertices;i++)
4116 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4117 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4118 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4119 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4120 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4121 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4122 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4123 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4124 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4125 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4126 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4127 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4128 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4129 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4130 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4131 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4132 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4133 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4134 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4135 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4136 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4137 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4138 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4139 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4140 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4141 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4142 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4143 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4144 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4145 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4146 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4147 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4149 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4150 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the light-source mode, run once per span.
// For every x in [span->startx, span->endx) it combines the colour texture
// (plus optional pants/shirt layers), the optional normal/gloss textures,
// the optional cube filter and an attenuation factor into a BGRA8 pixel in
// buffer_FragColorbgra8, then hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies uniform4f[], shader_permutation and shader_exactspecularmath
//   triangle - passed through to the attribute/span helpers
//   span     - supplies startx/endx and is passed through to the helpers
4153 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers indexed by x; the byte buffers hold 4 bytes per pixel
4156 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4160 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4162 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4163 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4164 int x, startx = span->startx, endx = span->endx;
4165 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// each interpolated attribute is kept as a start value (data) and a per-x step (slope)
4166 float CubeVectordata[4];
4167 float CubeVectorslope[4];
4168 float LightVectordata[4];
4169 float LightVectorslope[4];
4170 float EyeVectordata[4];
4171 float EyeVectorslope[4];
4173 float diffusetex[4];
4175 float surfacenormal[4];
4176 float lightnormal[4];
4178 float specularnormal[4];
4181 float SpecularPower;
4182 float CubeVector[4];
// Load the colour uniforms. Components 0 and 2 are swapped on load
// (uniform +0 -> [2], uniform +2 -> [0]), presumably so they line up with
// the BGRA byte order of the texture buffers.
4185 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4186 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4187 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4188 Color_Glow[3] = 0.0f;
4189 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4190 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4191 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is filled from the Alpha uniform
4192 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4193 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4194 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4195 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4196 Color_Diffuse[3] = 0.0f;
4197 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4198 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4199 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4200 Color_Specular[3] = 0.0f;
4201 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4202 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4203 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4204 Color_Pants[3] = 0.0f;
4205 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4206 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4207 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4208 Color_Shirt[3] = 0.0f;
4209 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4210 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4211 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4212 LightColor[3] = 0.0f;
// pre-scaled by 1/255: it is later multiplied by glosstex[3], which is read
// straight from an unsigned char buffer
4213 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up the per-span interpolants; below each one is evaluated as (data + slope*x) * z
4214 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4215 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4216 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4217 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4218 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// base colour texture for the whole span
4219 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional pants/shirt layers, sampled with the same texcoords as the colour texture
4220 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4222 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4223 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional cube filter, sampled with the TEXCOORD3 attribute
4225 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4226 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// --- specular path: uses both the normal and the gloss texture ---
4227 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4229 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4230 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4231 for (x = startx;x < endx;x++)
// light falloff derived from the interpolated cube vector, compared against a 0.01 cutoff
4234 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4235 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4236 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4237 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4238 if (attenuation < 0.01f)
// shadow map result scales the attenuation, which is then re-tested against the cutoff
4240 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4242 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4243 if (attenuation < 0.01f)
// base colour as floats, still in byte units (read from an unsigned char buffer)
4247 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4248 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4249 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4250 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add pants and shirt layers tinted by their colours (their [3] components were set to 0 above)
4251 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4253 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4254 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4255 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4256 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4258 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4259 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4260 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4261 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1 per component, reading bytes in 2,1,0 order
4262 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4263 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4264 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4265 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector, normalized per pixel
4267 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4268 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4269 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4270 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), negative values clamped to 0
4272 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4274 if(thread->shader_exactspecularmath)
4276 // reflect lightnormal at surfacenormal, take the negative of that
4277 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4279 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4280 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4281 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4282 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4284 // dot of this and normalize(EyeVectorFogDepth.xyz)
4285 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4286 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4287 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4288 DPSOFTRAST_Vector3Normalize(eyenormal);
4290 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// alternative specular computation: normalized sum of the light and eye
// directions, dotted with the surface normal
4294 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4295 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4296 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4297 DPSOFTRAST_Vector3Normalize(eyenormal);
4299 specularnormal[0] = lightnormal[0] + eyenormal[0];
4300 specularnormal[1] = lightnormal[1] + eyenormal[1];
4301 specularnormal[2] = lightnormal[2] + eyenormal[2];
4302 DPSOFTRAST_Vector3Normalize(specularnormal);
4304 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is 1 + SpecularPower * gloss alpha byte (SpecularPower already carries the 1/255 factor)
4306 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// final colour = (diffusetex*(ambient + diffuse) + glosstex*specular) * LightColor [* cube filter] * attenuation;
// each channel is truncated to int and only clamped at the upper bound (255)
4308 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4310 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4311 attenuation *= (1.0f / 255.0f);
4312 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4313 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4314 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4315 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4319 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4320 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4321 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4322 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the pixel; the alpha byte is the (colormapped) diffuse texture alpha
4324 buffer_FragColorbgra8[x*4+0] = d[0];
4325 buffer_FragColorbgra8[x*4+1] = d[1];
4326 buffer_FragColorbgra8[x*4+2] = d[2];
4327 buffer_FragColorbgra8[x*4+3] = d[3];
// --- diffuse path: normal map lookup but no gloss/specular term ---
4330 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4332 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4333 for (x = startx;x < endx;x++)
// same attenuation and shadow map handling as in the specular loop
4336 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4337 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4338 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4339 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4340 if (attenuation < 0.01f)
4342 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4344 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4345 if (attenuation < 0.01f)
4349 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4350 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4351 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4352 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4353 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4355 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4356 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4357 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4358 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// decode the normal map: byte/128 - 1 per component, reading bytes in 2,1,0 order
4360 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4361 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4362 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4363 DPSOFTRAST_Vector3Normalize(surfacenormal);
4365 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4366 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4367 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4368 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), negative values clamped to 0
4370 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final colour = diffusetex*(ambient + diffuse) * LightColor [* cube filter] * attenuation, upper-clamped to 255
4371 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4373 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4374 attenuation *= (1.0f / 255.0f);
4375 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4376 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4377 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4378 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4382 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4383 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4384 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4385 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4387 buffer_FragColorbgra8[x*4+0] = d[0];
4388 buffer_FragColorbgra8[x*4+1] = d[1];
4389 buffer_FragColorbgra8[x*4+2] = d[2];
4390 buffer_FragColorbgra8[x*4+3] = d[3];
// --- ambient-only loop: no normal map lookup; colour is diffusetex * Color_Ambient * LightColor ---
4395 for (x = startx;x < endx;x++)
4398 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4399 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4400 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4401 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4402 if (attenuation < 0.01f)
4404 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4406 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4407 if (attenuation < 0.01f)
4411 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4412 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4413 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4414 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4415 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4417 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4418 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4419 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4420 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4422 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4424 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4425 attenuation *= (1.0f / 255.0f);
4426 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4427 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4428 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4429 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4433 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4434 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4435 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4436 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4438 buffer_FragColorbgra8[x*4+0] = d[0];
4439 buffer_FragColorbgra8[x*4+1] = d[1];
4440 buffer_FragColorbgra8[x*4+2] = d[2];
4441 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the common span finisher
4444 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction mode. Issues three array operations, each
// taking (array, array, 4x4 matrix taken from dpsoftrast.uniform4f):
//   1. TEXCOORD4 <- POSITION with the ModelViewProjection matrix (no projection);
//      the refraction pixel shader reads TEXCOORD4 as ModelViewProjectionPosition
//   2. TEXCOORD0 <- TEXCOORD0 with the texture matrix
//   3. POSITION  <- POSITION transformed and projected
// NOTE(review): argument order is presumably (destination, source, matrix) -
// confirm against DPSOFTRAST_Array_Transform.
4450 static void DPSOFTRAST_VertexShader_Refraction(void)
4452 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4453 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4454 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction mode, run once per span.
// For every x in [span->startx, span->endx) it derives a screen-space texture
// coordinate from the interpolated TEXCOORD4 attribute, offsets it by the
// normal map, samples the texture bound at GL20TU_REFRACTION and writes the
// sample tinted by RefractColor into buffer_FragColorbgra8.
// Returns immediately, before any span work, when no refraction texture is bound.
//   thread   - supplies texbound[] and uniform4f[]
//   triangle - passed through to the attribute/span helpers
//   span     - supplies startx/endx and is passed through to the helpers
4457 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4459 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4461 int x, startx = span->startx, endx = span->endx;
// per-span byte buffers, 4 bytes per pixel
4464 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4465 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolant for TEXCOORD4: start value (data) and per-x step (slope)
4468 float ModelViewProjectionPositiondata[4];
4469 float ModelViewProjectionPositionslope[4];
// only the first two components of these uniforms are used here
4472 float ScreenScaleRefractReflect[2];
4473 float ScreenCenterRefractReflect[2];
4474 float DistortScaleRefractReflect[2];
4475 float RefractColor[4];
// nothing to sample from without a bound refraction texture
4477 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4478 if(!texture) return;
4481 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4482 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4485 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4488 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4489 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4490 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4491 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4492 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4493 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// components 0 and 2 are swapped on load (uniform +2 -> [0], +0 -> [2]),
// presumably to match the BGRA byte order of the sampled colour
4494 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4495 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4496 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4497 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4500 for (x = startx;x < endx;x++)
4502 float SafeScreenTexCoord[2];
4503 float ScreenTexCoord[2];
4510 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated fourth component of TEXCOORD4
4511 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4513 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4514 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4515 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the GLSL quoted below says DistortScaleRefractReflect.zw, but
// the code applies components [0] and [1]; confirm which is intended.
4517 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map: byte/128 - 1 per component, reading bytes in 2,1,0 order
4518 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4519 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4520 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4521 DPSOFTRAST_Vector3Normalize(v);
4522 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4523 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4525 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4526 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the colour products are stored into unsigned char without a
// clamp; only the alpha value below is limited to 255 - confirm RefractColor
// components never exceed 1.
4528 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4529 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4530 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4531 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
// hand the finished span colours to the common span finisher
4534 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water mode.
// For each of dpsoftrast.numvertices vertices it forms the vector from the
// vertex position to the EyePosition uniform, expresses it in the per-vertex
// svector/tvector/normal basis (read from TEXCOORD1/2/3) and stores the result
// in TEXCOORD6 with a fourth component of 0. Afterwards POSITION is
// transformed and projected, TEXCOORD4 receives POSITION transformed by the
// ModelViewProjection matrix, and TEXCOORD0 is transformed by the texture matrix.
4539 static void DPSOFTRAST_VertexShader_Water(void)
4542 int numvertices = dpsoftrast.numvertices;
4543 float EyePosition[4];
4544 float EyeVectorModelSpace[4];
4550 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4551 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4552 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4553 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// load the arrays that the loop below reads back through dpsoftrast.post_array4f
4554 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4555 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4556 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4557 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4558 for (i = 0;i < numvertices;i++)
// gather the xyz of position and of the three basis vectors for vertex i
4560 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4561 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4562 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4563 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4564 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4565 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4566 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4567 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4568 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4569 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4570 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4571 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye position
4572 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4573 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4574 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// three dot products: components of that vector along svector, tvector and normal
4575 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4576 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4577 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// the water pixel shader reads TEXCOORD6 as its EyeVector attribute
4578 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4579 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4580 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4581 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): -1 is passed where other calls pass a source array index;
// presumably it means "use the POSITION data already loaded above" - confirm
// against DPSOFTRAST_Array_TransformProject.
4583 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD4 is read by the water pixel shader as ModelViewProjectionPosition
4584 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4585 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the water mode, run once per span.
// For every x in [span->startx, span->endx) it derives two screen-space
// texture coordinates (refraction in components 0/1, reflection in 2/3),
// offsets both by the normal map, samples the textures bound at
// GL20TU_REFRACTION and GL20TU_REFLECTION, and blends the two tinted samples
// with a Fresnel factor computed from the interpolated eye vector.
// Returns immediately, before any span work, when either texture is not bound.
//   thread   - supplies texbound[] and uniform4f[]
//   triangle - passed through to the attribute/span helpers
//   span     - supplies startx/endx and is passed through to the helpers
4589 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4591 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4593 int x, startx = span->startx, endx = span->endx;
// per-span byte buffers, 4 bytes per pixel
4596 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4597 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolants for TEXCOORD4 and TEXCOORD6: start value (data) and per-x step (slope)
4600 float ModelViewProjectionPositiondata[4];
4601 float ModelViewProjectionPositionslope[4];
4602 float EyeVectordata[4];
4603 float EyeVectorslope[4];
// all four components are used: [0],[1] for refraction, [2],[3] for reflection
4606 float ScreenScaleRefractReflect[4];
4607 float ScreenCenterRefractReflect[4];
4608 float DistortScaleRefractReflect[4];
4609 float RefractColor[4];
4610 float ReflectColor[4];
4611 float ReflectFactor;
4612 float ReflectOffset;
// both textures are required; bail out if either is missing
4614 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4615 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4616 if(!texture_refraction || !texture_reflection) return;
4619 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4620 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4623 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4624 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4627 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4628 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4629 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4630 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4631 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4632 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4633 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4634 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4635 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4636 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4637 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4638 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// colour uniforms: components 0 and 2 are swapped on load (uniform +2 -> [0],
// +0 -> [2]), presumably to match the BGRA byte order of the sampled colours
4639 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4640 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4641 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4642 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4643 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4644 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4645 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4646 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4647 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4648 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4651 for (x = startx;x < endx;x++)
4653 float SafeScreenTexCoord[4];
4654 float ScreenTexCoord[4];
// c1 receives the refraction sample, c2 the reflection sample
4657 unsigned char c1[4];
4658 unsigned char c2[4];
4663 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated fourth component of TEXCOORD4
4664 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4666 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4667 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4668 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4669 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4670 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4672 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// decode the normal map: byte/128 - 1 per component, reading bytes in 2,1,0 order
4673 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4674 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4675 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4676 DPSOFTRAST_Vector3Normalize(v);
4677 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4678 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4679 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4680 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4682 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused here for the interpolated eye vector
4683 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4684 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4685 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4686 DPSOFTRAST_Vector3Normalize(v);
// the pow(..., 2.0) of the quoted GLSL is written as a plain multiplication
4687 Fresnel = 1.0f - v[2];
4688 Fresnel = min(1.0f, Fresnel);
4689 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4691 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4692 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4693 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4694 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// linear blend: refraction weighted by (1 - Fresnel), reflection by Fresnel.
// NOTE(review): the colour results are stored into unsigned char without a
// clamp (only alpha is limited to 255), and Fresnel is not limited to [0,1]
// after ReflectFactor/ReflectOffset are applied - confirm the uniforms keep
// these values in range.
4696 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4697 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4698 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4699 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
// hand the finished span colours to the common span finisher
4702 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4707 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4709 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the ShowDepth shader mode.
// thread/triangle/span are forwarded unchanged to the span begin/finish helpers.
// The fragment colors for pixels [span->startx, span->endx) are set to all-zero bytes.
// NOTE(review): despite the mode's name, no depth value is turned into a color here.
4712 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4715 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4716 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4717 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4718 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4719 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4724 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4726 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredGeometry shader mode.
// Begins the span, zero-fills the BGRA8 fragment colors for pixels
// [span->startx, span->endx), and hands the buffer to the span finish helper.
// NOTE(review): this only writes zero bytes; no geometry data is produced here.
4729 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4732 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4733 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4734 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4735 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4736 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both array
// arguments and the uniform4f block starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4741 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4743 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the DeferredLightSource shader mode.
// Begins the span, zero-fills the BGRA8 fragment colors for pixels
// [span->startx, span->endx), and hands the buffer to the span finish helper.
// NOTE(review): this only writes zero bytes; no lighting is computed here.
4746 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4749 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4750 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4751 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4752 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4753 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode description; one entry per mode lives in DPSOFTRAST_ShaderModeTable.
// NOTE(review): DPSOFTRAST_Interpret_Draw also reads a member named lodarrayindex from
// this struct; the table rows start with an integer that presumably initializes it - confirm.
4758 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; DPSOFTRAST_DrawTriangles calls it with no arguments
4761 void (*Vertex)(void);
// span stage; DPSOFTRAST_Draw_ProcessSpans calls it once per surviving span
4762 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex arrays used by the mode; readers test each entry against DPSOFTRAST_ARRAY_TOTAL,
// so an entry of ~0 (255 as unsigned char) acts as the list terminator
4763 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture units used by the mode; readers test each entry against DPSOFTRAST_MAXTEXTUREUNITS,
// so ~0 acts as the terminator here as well
4764 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4766 DPSOFTRAST_ShaderModeInfo;
// Table of shader modes, indexed by shader_mode (see uses of DPSOFTRAST_ShaderModeTable[...shader_mode]).
// Each row: leading integer, vertex function, span function, array list, texture unit list.
// The array and texture unit lists are terminated by ~0.
// NOTE(review): row order presumably has to match the SHADERMODE_* enumeration (SHADERMODE_COUNT rows) - confirm.
4768 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4770 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4771 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4772 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4773 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4774 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4775 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4776 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4777 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4778 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
// the next two rows reuse the Lightmap and VertexColor shader pairs from above
4779 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4780 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4781 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4782 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4783 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4784 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
// NOTE(review): the three rows below give no texunits initializer, so that member is
// zero-filled instead of being terminated by ~0 (compare the Depth_Or_Shadow row, which
// uses {~0}, {~0}). Loops that walk texunits will therefore see unit 0 in every slot.
// Looks unintended - confirm whether a trailing {~0} should be added.
4785 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4786 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4787 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span.
// thread: supplies depth test state (depthtest, fb_depthfunc) and the pixelmaskarray scratch buffer.
// span:   span to test; on return span->pixelmask points at the mask and span->startx holds
//         the trimmed start (span->endx is presumably updated the same way - confirm).
4790 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4795 unsigned int *depthpixel;
4799 unsigned char *pixelmask;
4800 DPSOFTRAST_State_Triangle *triangle;
4801 triangle = &thread->triangles[span->triangle];
// depthpixel points at the depth buffer entry for (span->x, span->y), so it is indexed by span-relative x
4802 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4803 startx = span->startx;
4805 depth = span->depthbase;
4806 depthslope = span->depthslope;
4807 pixelmask = thread->pixelmaskarray;
4808 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4810 switch(thread->fb_depthfunc)
// d is the span depth stepped by depthslope per pixel; each case compares the stored
// depthpixel[x] (left operand) against d (right operand) and records the result in the mask
4813 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4814 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4815 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4816 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4817 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4818 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4819 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// shrink the span from both ends past pixels that failed the test
4821 while (startx < endx && !pixelmask[startx])
4823 while (endx > startx && !pixelmask[endx-1])
4828 // no depth testing means we're just dealing with color...
4829 memset(pixelmask + startx, 1, endx - startx);
4831 span->pixelmask = pixelmask;
4832 span->startx = startx;
// Depth write pass for one span, run after the span's pixel shader.
// Does nothing unless depth writes (depthmask) and depth testing are enabled and a depth buffer exists.
// thread: supplies depthmask/depthtest state.
// span:   supplies the depth plane (depthbase, depthslope), the pixel mask and the x range.
4836 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4838 int x, d, depth, depthslope, startx, endx;
4839 const unsigned char *pixelmask;
4840 unsigned int *depthpixel;
4841 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4843 depth = span->depthbase;
4844 depthslope = span->depthslope;
4845 pixelmask = span->pixelmask;
4846 startx = span->startx;
// same addressing as the depth test: base of the span within the depth buffer row
4848 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// walks the span with the same d stepping as the depth test
// NOTE(review): presumably stores d into depthpixel[x] where pixelmask[x] is set - confirm
4849 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every span queued on this thread (thread->spans[0 .. numspans-1]) and then
// empties the queue. Per span: depth test, optional pixel shader, depth write.
4855 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4858 DPSOFTRAST_State_Triangle *triangle;
4859 DPSOFTRAST_State_Span *span;
4860 for (i = 0; i < thread->numspans; i++)
4862 span = &thread->spans[i];
4863 triangle = &thread->triangles[span->triangle];
4864 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have trimmed the span down to nothing
4865 if (span->startx >= span->endx)
4867 // run pixel shader if appropriate
4868 // do this before running depthmask code, to allow the pixelshader
4869 // to clear pixelmask values for alpha testing
4870 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4871 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4872 DPSOFTRAST_Draw_DepthWrite(thread, span);
4874 thread->numspans = 0;
// Draw command, opcode 22, with the fields listed in the third argument.
// NOTE(review): the code below also reads command->commandsize, which is not in this list;
// presumably DEFCOMMAND adds common header fields - confirm against the macro definition.
4877 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)

// Executes one Draw command on one thread.
// For each triangle: fetch indices, clip against the near plane, compute screen bounds,
// set up interpolation planes for w and the shader's attribute arrays, pick mip levels,
// then emit spans row by row into thread->spans (flushed through DPSOFTRAST_Draw_ProcessSpans).
// thread:  the executing thread; only rows in its two bands (miny1..maxy1, miny2..maxy2) are drawn.
// command: shared between threads; each thread decrements command->refcount when done.
4879 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4882 int cullface = thread->cullface;
4883 int minx, maxx, miny, maxy;
4884 int miny1, maxy1, miny2, maxy2;
4885 __m128i fbmin, fbmax;
4886 __m128 viewportcenter, viewportscale;
4887 int firstvertex = command->firstvertex;
4888 int numvertices = command->numvertices;
4889 int numtriangles = command->numtriangles;
4890 const int *element3i = command->element3i;
4891 const unsigned short *element3s = command->element3s;
4892 int clipped = command->clipped;
4899 int starty, endy, bandy;
4903 float clip0origin, clip0slope;
4905 __m128 triangleedge1, triangleedge2, trianglenormal;
4908 DPSOFTRAST_State_Triangle *triangle;
4909 DPSOFTRAST_Texture *texture;
4910 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp both of the thread's row bands to the scissor rectangle's vertical range
4911 miny = thread->fb_scissor[1];
4912 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4913 miny1 = bound(miny, thread->miny1, maxy);
4914 maxy1 = bound(miny, thread->maxy1, maxy);
4915 miny2 = bound(miny, thread->miny2, maxy);
4916 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither band: nothing to draw on this thread
4917 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// last thread to release the command frees the vertex data, but only when the command is
// header-sized, i.e. the data was allocated separately (see DPSOFTRAST_Draw_AllocateDrawCommand)
4919 if (!ATOMIC_DECREMENT(command->refcount))
4921 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4922 MM_FREE(command->arrays);
4926 minx = thread->fb_scissor[0];
4927 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp rectangle as packed 16-bit (x, y) pairs: from (minx, miny1) to (maxx-1, maxy2-1)
4928 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4929 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4930 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4931 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4932 screen[3] = _mm_setzero_ps();
4933 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4934 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates (numvertices vec4s); the arrays that
// follow are numvertices vec4s each, the first being the post-transform position
4936 const float *screencoord4f = command->arrays;
4937 const float *arrays = screencoord4f + numvertices*4;
4939 // generate the 3 edges of this triangle
4940 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices are rebased by firstvertex; taken from element3s or element3i
// (the selecting condition is presumably which pointer is non-NULL - confirm)
4943 e[0] = element3s[i*3+0] - firstvertex;
4944 e[1] = element3s[i*3+1] - firstvertex;
4945 e[2] = element3s[i*3+2] - firstvertex;
4949 e[0] = element3i[i*3+0] - firstvertex;
4950 e[1] = element3i[i*3+1] - firstvertex;
4951 e[2] = element3i[i*3+2] - firstvertex;
4960 #define SKIPBACKFACE \
4961 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4962 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4963 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4964 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4965 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4969 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4973 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4978 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4979 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4981 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4982 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4984 #define CLIPPEDVERTEXCOPY(k,p1) \
4985 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4987 #define GENATTRIBCOPY(attrib, p1) \
4988 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4989 #define GENATTRIBLERP(attrib, p1, p2) \
4991 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4992 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4994 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4998 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4999 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5000 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5001 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
5002 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
5003 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5004 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5010 // calculate distance from nearplane
// clipdist is z + w of each vertex's post-transform position; >= 0 is treated as in front.
// The branches below cover all eight in/out combinations: vertices in front are copied,
// crossed edges get an interpolated vertex, and some cases produce a fourth point (screen[3]).
5011 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5012 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5013 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5014 if (clipdist[0] >= 0.0f)
5016 if (clipdist[1] >= 0.0f)
5018 if (clipdist[2] >= 0.0f)
5021 // triangle is entirely in front of nearplane
5022 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5029 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5037 if (clipdist[2] >= 0.0f)
5039 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5046 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5053 else if (clipdist[1] >= 0.0f)
5055 if (clipdist[2] >= 0.0f)
5057 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5064 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5070 else if (clipdist[2] >= 0.0f)
5072 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5077 else continue; // triangle is entirely behind nearplane
5080 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as 16-bit values; with only three
// points the third is duplicated. The min/max reductions then give the bounding box,
// which is clamped to fbmin/fbmax.
5081 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5082 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5083 screenmin = _mm_min_epi16(screeni, screenir),
5084 screenmax = _mm_max_epi16(screeni, screenir);
5085 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5086 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5087 screenmin = _mm_max_epi16(screenmin, fbmin);
5088 screenmax = _mm_min_epi16(screenmax, fbmax);
5089 // skip offscreen triangles
5090 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5092 starty = _mm_extract_epi16(screenmin, 1);
5093 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows lie entirely in the gap between this thread's two bands
5094 if (starty >= maxy1 && endy <= miny2)
5096 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle buffer; spans refer to it by index
5099 triangle = &thread->triangles[thread->numtriangles];
5101 // calculate attribute plans for triangle data...
5102 // okay, this triangle is going to produce spans, we'd better project
5103 // the interpolants now (this is what gives perspective texturing),
5104 // this consists of simply multiplying all arrays by the W coord
5105 // (which is basically 1/Z), which will be undone per-pixel
5106 // (multiplying by Z again) to get the perspective-correct array
5109 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5110 __m128 mipedgescale, mipdensity;
// edge vectors divided by the triangle normal's z give the factors that turn per-edge
// differences into per-pixel x and y slopes
5111 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5112 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5113 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5114 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5115 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for the screen-space w component: triangle->w = {x slope, y slope, origin}
5116 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5117 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5118 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5119 attribedge1 = _mm_sub_ss(w0, w1);
5120 attribedge2 = _mm_sub_ss(w2, w1);
5121 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5122 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5123 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5124 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5125 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5126 _mm_store_ss(&triangle->w[0], attribxslope);
5127 _mm_store_ss(&triangle->w[1], attribyslope);
5128 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: build the same kind of plane equation for screen z, combine it with
// fb_clipplane, and express the result as a per-row x boundary (clip0origin + y*clip0slope)
5133 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5135 float cliporigin, clipxslope, clipyslope;
5136 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5137 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5138 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5139 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5140 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5141 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5142 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5143 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5144 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5147 clip0origin = -cliporigin/clipxslope;
5148 clip0slope = -clipyslope/clipxslope;
5149 clip0dir = clipxslope > 0 ? 1 : -1;
5151 else if(clipyslope > 0)
5153 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5154 clip0slope = dpsoftrast.fb_width;
5157 else if(clipyslope < 0)
5159 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5160 clip0slope = -dpsoftrast.fb_width;
5163 else if(clip0origin < 0) continue;
// plane equations for every array the current shader mode lists (list ends at the first
// entry >= DPSOFTRAST_ARRAY_TOTAL); attributes are multiplied by w before the plane is built
5166 mipedgescale = _mm_setzero_ps();
5167 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5169 __m128 attrib0, attrib1, attrib2;
5170 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5171 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next vec4 array in the command data
5173 arrays += numvertices*4;
5174 GENATTRIBS(attrib0, attrib1, attrib2);
5175 attriborigin = _mm_mul_ps(attrib1, w1);
5176 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5177 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5178 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5179 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5180 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5181 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5182 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5183 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the mode's lodarrayindex array drives mip selection: attribute difference along each
// of the two edges, scaled by the reciprocal screen-space edge length
5184 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5186 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5187 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5188 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5189 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit used by the shader mode; levels default to 0 and
// are only computed for textures whose filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
5193 memset(triangle->mip, 0, sizeof(triangle->mip));
5194 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5196 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5197 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5199 texture = thread->texbound[texunit];
5200 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5202 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5203 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5204 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5205 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5206 // this will be multiplied in the texturing routine by the texture resolution
5207 y = _mm_cvtss_si32(mipdensity);
// log2 of the square root of the density (hence the 0.5 factor), clamped to the last mip level
5210 y = (int)(log((float)y)*0.5f/M_LN2);
5211 if (y > texture->mipmaps - 1)
5212 y = texture->mipmaps - 1;
5213 triangle->mip[texunit] = y;
// row loop: first covers rows up to maxy1 (band 1), then jumps to miny2 and covers rows up to maxy2 (band 2)
5219 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5222 __m128 xcoords, xslope;
// yccmask has one nibble per point, set when y is greater than that point's integer y;
// the switch picks the two active edges (previous/next point indices) from that pattern
5223 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5224 int yccmask = _mm_movemask_epi8(ycc);
5225 int edge0p, edge0n, edge1p, edge1n;
5234 case 0xFFFF: /*0000*/ y = endy; continue;
5235 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5236 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5237 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5238 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5239 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5240 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5241 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5242 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5243 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5244 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5245 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5246 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5247 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5248 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5249 case 0x0000: /*1111*/ y++; continue;
5257 case 0xFFFF: /*000*/ y = endy; continue;
5258 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5259 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5260 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5261 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5262 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5263 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5264 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can use the current edge pair, limited to the current band
5267 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5268 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5269 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5270 nexty = _mm_extract_epi16(ycc, 0);
5271 if (nexty >= bandy) nexty = bandy-1;
// x position and per-row x step of both edges at row y; swapped if needed so the
// lower x value ends up in the low half (start) and the other in the high half (end)
5272 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5273 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5274 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5275 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5276 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5277 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5279 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5280 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// clip0 is the clip boundary's x on this row, stepped by clip0slope per row
5282 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5283 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5285 int startx, endx, offset;
5286 startx = _mm_cvtss_si32(xcoords);
5287 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row's x range to the scissor rectangle and drop empty rows
5288 if (startx < minx) startx = minx;
5289 if (endx > maxx) endx = maxx;
5290 if (startx >= endx) continue;
5298 if(endx <= clip0) continue;
5299 startx = (int)clip0;
5302 else if (endx > clip0)
5304 if(startx >= clip0) continue;
// rows wider than DPSOFTRAST_DRAW_MAXSPANLENGTH are emitted as several spans
5309 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5311 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5312 span->triangle = thread->numtriangles;
5316 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5317 if (span->startx >= span->endx)
// integer depth plane for the span: w at the span origin scaled by DPSOFTRAST_DEPTHSCALE,
// minus the polygon offset term
5319 wslope = triangle->w[0];
5320 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5321 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5322 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span buffer full: process everything queued so far
5323 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5324 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush the spans that refer to it before reusing the slots
5329 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5331 DPSOFTRAST_Draw_ProcessSpans(thread);
5332 thread->numtriangles = 0;
// release this thread's reference; the last one frees separately allocated vertex data
5336 if (!ATOMIC_DECREMENT(command->refcount))
5338 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5339 MM_FREE(command->arrays);
// flush whatever is still queued for this command
5342 if (thread->numspans > 0 || thread->numtriangles > 0)
5344 DPSOFTRAST_Draw_ProcessSpans(thread);
5345 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its per-vertex data and index copy,
// and points dpsoftrast.screencoord4f / dpsoftrast.post_array4f[] into that storage.
// firstvertex, numvertices, numtriangles: stored in the command (and in dpsoftrast for the first two).
// element3i / element3s: triangle indices; one of them is copied into the command's data.
// The caller (DPSOFTRAST_DrawTriangles) uses the returned command pointer.
5350 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5354 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two vec4 arrays are always present: screen coordinates and transformed position
5355 int datasize = 2*numvertices*sizeof(float[4]);
5356 DPSOFTRAST_Command_Draw *command;
5357 unsigned char *data;
// plus one vec4 array per entry in the current shader mode's array list
5358 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5360 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5361 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5363 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy (16-bit or 32-bit indices)
5366 datasize += numtriangles*sizeof(unsigned short[3]);
5368 datasize += numtriangles*sizeof(int[3]);
5369 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: the command is header-only and the data is allocated separately.
// Code that frees the data recognizes this case by commandsize being header-sized.
// NOTE(review): no NULL check on the MM_CALLOC result is visible here - confirm.
5370 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5372 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5373 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives directly after the command header
5377 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5378 data = (unsigned char *)command + commandsize;
5380 command->firstvertex = firstvertex;
5381 command->numvertices = numvertices;
5382 command->numtriangles = numtriangles;
5383 command->arrays = (float *)data;
// clear all output array pointers, then carve the data block up in the same order the
// size was computed: screen coordinates, position, then each array listed by the shader mode
5384 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5385 dpsoftrast.firstvertex = firstvertex;
5386 dpsoftrast.numvertices = numvertices;
5387 dpsoftrast.screencoord4f = (float *)data;
5388 data += numvertices*sizeof(float[4]);
5389 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5390 data += numvertices*sizeof(float[4]);
5391 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5393 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5394 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5396 dpsoftrast.post_array4f[j] = (float *)data;
5397 data += numvertices*sizeof(float[4]);
// the indices are copied so the command does not depend on the caller's buffers
5399 command->element3i = NULL;
5400 command->element3s = NULL;
5403 command->element3s = (unsigned short *)data;
5404 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5408 command->element3i = (int *)data;
5409 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point for drawing indexed triangles.
// Allocates a Draw command, runs the current shader mode's vertex stage, and then either
// publishes the command to the worker threads or flushes it directly.
// firstvertex, numvertices, numtriangles, element3i, element3s are forwarded unchanged
// to DPSOFTRAST_Draw_AllocateDrawCommand.
5414 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5416 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5417 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range touched by this draw, clamped to the framebuffer height
5418 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5419 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: free separately allocated data (header-sized command) and undo the command
5420 if (command->starty >= command->endy)
5422 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5423 MM_FREE(command->arrays);
5424 DPSOFTRAST_UndoCommand(command->commandsize);
5427 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it in DPSOFTRAST_Interpret_Draw
5428 command->refcount = dpsoftrast.numthreads;
5430 if (dpsoftrast.usethreads)
5433 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row bands overlap this draw's row range
5434 for (i = 0; i < dpsoftrast.numthreads; i++)
5436 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5437 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5438 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch of the usethreads test - confirm
5443 DPSOFTRAST_Draw_FlushThreads();
5447 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5448 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5450 thread->validate |= DPSOFTRAST_VALIDATE_FB;
5452 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5454 DPSOFTRAST_Command_SetRenderTargets *command;
5455 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5456 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5457 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5459 dpsoftrast.fb_width = width;
5460 dpsoftrast.fb_height = height;
5461 dpsoftrast.fb_depthpixels = depthpixels;
5462 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5463 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5464 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5465 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5466 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5467 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5468 command->width = width;
5469 command->height = height;
5472 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5474 int commandoffset = thread->commandoffset;
5475 while (commandoffset != endoffset)
5477 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5478 switch (command->opcode)
5480 #define INTERPCOMMAND(name) \
5481 case DPSOFTRAST_OPCODE_##name : \
5482 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5483 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5484 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5485 commandoffset = 0; \
5487 INTERPCOMMAND(Viewport)
5488 INTERPCOMMAND(ClearColor)
5489 INTERPCOMMAND(ClearDepth)
5490 INTERPCOMMAND(ColorMask)
5491 INTERPCOMMAND(DepthTest)
5492 INTERPCOMMAND(ScissorTest)
5493 INTERPCOMMAND(Scissor)
5494 INTERPCOMMAND(BlendFunc)
5495 INTERPCOMMAND(BlendSubtract)
5496 INTERPCOMMAND(DepthMask)
5497 INTERPCOMMAND(DepthFunc)
5498 INTERPCOMMAND(DepthRange)
5499 INTERPCOMMAND(PolygonOffset)
5500 INTERPCOMMAND(CullFace)
5501 INTERPCOMMAND(SetTexture)
5502 INTERPCOMMAND(SetShader)
5503 INTERPCOMMAND(Uniform4f)
5504 INTERPCOMMAND(UniformMatrix4f)
5505 INTERPCOMMAND(Uniform1i)
5506 INTERPCOMMAND(SetRenderTargets)
5507 INTERPCOMMAND(ClipPlane)
5509 case DPSOFTRAST_OPCODE_Draw:
5510 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5511 commandoffset += command->commandsize;
5512 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5514 thread->commandoffset = commandoffset;
5517 case DPSOFTRAST_OPCODE_Reset:
5522 thread->commandoffset = commandoffset;
5525 static int DPSOFTRAST_Draw_Thread(void *data)
5527 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5528 while(thread->index >= 0)
5530 if (thread->commandoffset != dpsoftrast.drawcommand)
5532 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5536 Thread_LockMutex(thread->drawmutex);
5537 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5539 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5540 thread->starving = true;
5541 Thread_CondWait(thread->drawcond, thread->drawmutex);
5542 thread->starving = false;
5544 Thread_UnlockMutex(thread->drawmutex);
5550 static void DPSOFTRAST_Draw_FlushThreads(void)
5552 DPSOFTRAST_State_Thread *thread;
5554 DPSOFTRAST_Draw_SyncCommands();
5555 if (dpsoftrast.usethreads)
5557 for (i = 0; i < dpsoftrast.numthreads; i++)
5559 thread = &dpsoftrast.threads[i];
5560 if (thread->commandoffset != dpsoftrast.drawcommand)
5562 Thread_LockMutex(thread->drawmutex);
5563 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5564 Thread_CondSignal(thread->drawcond);
5565 Thread_UnlockMutex(thread->drawmutex);
5568 for (i = 0; i < dpsoftrast.numthreads; i++)
5570 thread = &dpsoftrast.threads[i];
5571 if (thread->commandoffset != dpsoftrast.drawcommand)
5573 Thread_LockMutex(thread->drawmutex);
5574 if (thread->commandoffset != dpsoftrast.drawcommand)
5576 thread->waiting = true;
5577 Thread_CondWait(thread->waitcond, thread->drawmutex);
5578 thread->waiting = false;
5580 Thread_UnlockMutex(thread->drawmutex);
5586 for (i = 0; i < dpsoftrast.numthreads; i++)
5588 thread = &dpsoftrast.threads[i];
5589 if (thread->commandoffset != dpsoftrast.drawcommand)
5590 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5593 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Public finish entry point.
//
// NOTE(review): the body was missing from the listing this block was rebuilt
// from.  A single call to DPSOFTRAST_Flush() is the presumed original;
// confirm against upstream.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5606 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5616 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5617 dpsoftrast.bigendian = u.b[3];
5618 dpsoftrast.fb_width = width;
5619 dpsoftrast.fb_height = height;
5620 dpsoftrast.fb_depthpixels = depthpixels;
5621 dpsoftrast.fb_colorpixels[0] = colorpixels;
5622 dpsoftrast.fb_colorpixels[1] = NULL;
5623 dpsoftrast.fb_colorpixels[1] = NULL;
5624 dpsoftrast.fb_colorpixels[1] = NULL;
5625 dpsoftrast.viewport[0] = 0;
5626 dpsoftrast.viewport[1] = 0;
5627 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5628 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5629 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5630 dpsoftrast.texture_firstfree = 1;
5631 dpsoftrast.texture_end = 1;
5632 dpsoftrast.texture_max = 0;
5633 dpsoftrast.color[0] = 1;
5634 dpsoftrast.color[1] = 1;
5635 dpsoftrast.color[2] = 1;
5636 dpsoftrast.color[3] = 1;
5637 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5638 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5639 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5640 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5641 for (i = 0; i < dpsoftrast.numthreads; i++)
5643 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5645 thread->cullface = GL_BACK;
5646 thread->colormask[0] = 1;
5647 thread->colormask[1] = 1;
5648 thread->colormask[2] = 1;
5649 thread->colormask[3] = 1;
5650 thread->blendfunc[0] = GL_ONE;
5651 thread->blendfunc[1] = GL_ZERO;
5652 thread->depthmask = true;
5653 thread->depthtest = true;
5654 thread->depthfunc = GL_LEQUAL;
5655 thread->scissortest = false;
5656 thread->viewport[0] = 0;
5657 thread->viewport[1] = 0;
5658 thread->viewport[2] = dpsoftrast.fb_width;
5659 thread->viewport[3] = dpsoftrast.fb_height;
5660 thread->scissor[0] = 0;
5661 thread->scissor[1] = 0;
5662 thread->scissor[2] = dpsoftrast.fb_width;
5663 thread->scissor[3] = dpsoftrast.fb_height;
5664 thread->depthrange[0] = 0;
5665 thread->depthrange[1] = 1;
5666 thread->polygonoffset[0] = 0;
5667 thread->polygonoffset[1] = 0;
5668 thread->clipplane[0] = 0;
5669 thread->clipplane[1] = 0;
5670 thread->clipplane[2] = 0;
5671 thread->clipplane[3] = 1;
5673 thread->numspans = 0;
5674 thread->numtriangles = 0;
5675 thread->commandoffset = 0;
5676 thread->waiting = false;
5677 thread->starving = false;
5679 thread->validate = -1;
5680 DPSOFTRAST_Validate(thread, -1);
5682 if (dpsoftrast.usethreads)
5684 thread->waitcond = Thread_CreateCond();
5685 thread->drawcond = Thread_CreateCond();
5686 thread->drawmutex = Thread_CreateMutex();
5687 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5693 void DPSOFTRAST_Shutdown(void)
5696 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5698 DPSOFTRAST_State_Thread *thread;
5699 for (i = 0; i < dpsoftrast.numthreads; i++)
5701 thread = &dpsoftrast.threads[i];
5702 Thread_LockMutex(thread->drawmutex);
5704 Thread_CondSignal(thread->drawcond);
5705 Thread_UnlockMutex(thread->drawmutex);
5706 Thread_WaitThread(thread->thread, 0);
5707 Thread_DestroyCond(thread->waitcond);
5708 Thread_DestroyCond(thread->drawcond);
5709 Thread_DestroyMutex(thread->drawmutex);
5712 for (i = 0;i < dpsoftrast.texture_end;i++)
5713 if (dpsoftrast.texture[i].bytes)
5714 MM_FREE(dpsoftrast.texture[i].bytes);
5715 if (dpsoftrast.texture)
5716 free(dpsoftrast.texture);
5717 if (dpsoftrast.threads)
5718 MM_FREE(dpsoftrast.threads);
5719 memset(&dpsoftrast, 0, sizeof(dpsoftrast));