3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform selection for the alignment, memory-barrier and atomic-counter macros.
// Each branch defines ALIGN (16-byte alignment), ATOMIC (4-byte alignment),
// MEMORY_BARRIER (expands to _mm_sfence()), the ATOMIC_COUNTER type and the
// ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD operations.
// Apple: atomics use the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: atomics use the Interlocked* functions.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Other GCC targets: atomics use the __sync builtins.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) for alignment, Interlocked* functions for atomics.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, no barrier, and plain
// (non-atomic) integer operations on an ordinary int counter.
// NOTE(review): several of the #ifndef/#endif guards that belong around these
// fallbacks are not visible in this excerpt.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#include <emmintrin.h>

// Compatibility shim: for GCC releases older than 4.6 (and not clang), define
// _mm_cvtss_f32 in terms of the vector-extract builtin.
// The version test compares major and minor together; the earlier form
// misspelled __GNUC__ and also matched newer majors whose minor is below 6.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
103 #define MM_FREE _mm_free
// Alternative definitions that map the allocation helpers straight onto malloc/calloc.
// NOTE(review): the preprocessor conditional that selects between these and the
// _mm_malloc-based versions is not visible in this excerpt.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Vertex attribute array slots: position, color, then eight texture coordinate sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of slots and sizes the per-array tables
// (triangle attribs, post_array4f).
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
// Per-texture record stored in the dpsoftrast.texture[] array.
126 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
133 DPSOFTRAST_TEXTURE_FILTER filter;
// bind counter, declared with the atomic counter type
136 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL value marks an unused slot in the texture array
137 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded to COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND) and
// command structs are declared with the ALIGN attribute.
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: an opcode byte and the command's size.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
// Opcode 0 is Reset; DPSOFTRAST_AllocateCommand writes it when a command does
// not fit in the remaining tail of the pool.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and the
// struct DPSOFTRAST_Command_<name>, which begins with the same
// opcode/commandsize header as DPSOFTRAST_Command.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
// size in bytes of the command pool's byte array
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest single command allowed; not used in the visible code
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands; offsets into commands[] wrap at DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
// raw command bytes; commands are placed here by DPSOFTRAST_AllocateCommand
170 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the DPSOFTRAST_CALCATTRIB macros.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three float4 rows: [0] is multiplied by span x, [1] by span y, [2] is the base value
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// Evaluate one interpolated attribute at a span's (x, y) using SSE.
// slope receives attribs[arrayindex][0]; data receives
// attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1].
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// The span argument is parenthesized before member access so that expression
// arguments (for example "spans + i") expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): presumably the pixel count of one sub-span; not used in the visible code
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bit flags stored in a thread's validate field; each names a group of derived
// values that DPSOFTRAST_Validate recomputes when the bit is set.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes; DPSOFTRAST_RecalcBlendFunc maps GL blend factor pairs onto these values.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE,
226 DPSOFTRAST_BLENDMODE_ALPHA,
227 DPSOFTRAST_BLENDMODE_ADDALPHA,
228 DPSOFTRAST_BLENDMODE_ADD,
229 DPSOFTRAST_BLENDMODE_INVMOD,
230 DPSOFTRAST_BLENDMODE_MUL,
231 DPSOFTRAST_BLENDMODE_MUL2,
232 DPSOFTRAST_BLENDMODE_SUBALPHA,
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes
235 DPSOFTRAST_BLENDMODE_TOTAL
237 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state: render state written by the DPSOFTRAST_Interpret_*
// functions, values derived from it, command bookkeeping and scratch buffers.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
// clip plane as computed by DPSOFTRAST_RecalcClipPlane
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
// bound texture per texture unit (pointers into dpsoftrast.texture[], rebased by DPSOFTRAST_Texture_Grow)
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
// viewport transform filled in by DPSOFTRAST_RecalcViewport
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's offset in the command pool; compared against dpsoftrast.drawcommand
// and commandpool.freecommand in DPSOFTRAST_Draw_FreeCommandPool
289 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it waits on this thread
291 volatile bool waiting;
// NOTE(review): presumably set by the worker when it runs out of commands; the writer is not visible here
292 volatile bool starving;
// scratch storage for spans, triangles and the span pixel mask
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is the file-level variable dpsoftrast.
305 typedef ALIGN(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport via DPSOFTRAST_RecalcViewport
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array source pointers
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound texture per texture unit (pointers into texture[], rebased by DPSOFTRAST_Texture_Grow)
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
// index at which DPSOFTRAST_Texture_New starts searching for a free slot
344 int texture_firstfree;
// texture array, reallocated by DPSOFTRAST_Texture_Grow
345 DPSOFTRAST_Texture *texture;
// description of the most recent rejected request, set by the texture functions
350 const char *errorstring;
// per-thread state array, indexed up to numthreads
355 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand made by DPSOFTRAST_Draw_SyncCommands
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
363 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the integer depth buffer format (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float color channels (nominally 0..1) into a 32-bit BGRA8 value:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7.
// Each argument is parenthesized so expression arguments are scaled as a whole,
// and the shifts are done on unsigned values so a full alpha byte does not
// overflow a signed int.  The result is converted back to int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations of the per-span depth test and depth write routines (defined outside this excerpt).
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recompute the thread's framebuffer-derived state: the scissor rectangle in
// framebuffer coordinates, the viewport transform, the clip plane and the
// thread's row ranges.
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
// scissor rectangle edges; y is flipped against the framebuffer height
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp to the framebuffer size
// NOTE(review): only the upper-bound clamps are visible in this excerpt
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so the viewport is recalculated first
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarge the texture array and re-point every bound-texture pointer (global
// and per-thread) into the reallocated array.
502 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be rebased afterwards
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
// the array starts at 1024 entries
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
// NOTE(review): in this excerpt no else is visible ahead of the doubling; confirm against the full file
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the array and is not checked for NULL
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings: keep each binding's element offset, applied to the new base
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase every thread's bindings the same way
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture: validate the request, pick a slot in dpsoftrast.texture[],
// fill in the mipmap table and allocate zeroed pixel storage.
// When a request is rejected, dpsoftrast.errorstring is set to a description.
// NOTE(review): the return statements are not visible in this excerpt.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
// format bits of flags
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// reject when the product of the dimensions is below 1
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// NOTE(review): the switch statement these case labels belong to is not visible in this excerpt
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
// depth format cannot be combined with the cubemap flag
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the same message is assigned a second time here; if this sits on a
// different path (for example an unknown-format default) the text may not match — confirm
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the check directly above
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// v & (v-1) is non-zero when v has more than one bit set
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot chosen here
602 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array if the slot lies beyond its capacity, and extend the used range
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
// clear the record and store the requested properties
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per mip level: 4 bytes per texel; record the running byte offset, the level size and dimensions
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// stop at a single texel, or after the first level when mipmaps were not requested
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible in this excerpt
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release a texture's pixel storage and return its slot to the free range.
// Does nothing when index does not name a live texture.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// NOTE(review): lines between the lookup and the free are not visible in this excerpt
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// clear the whole record (this also leaves bytes NULL, which marks the slot unused)
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing unused slots
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Rebuild mip levels 1..mipmaps-1 of a texture by averaging texels of the
// previous (larger) level.  Texels are 4 bytes each.
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// NOTE(review): the statement guarded by this check (presumably an early return) is not visible in this excerpt
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// keep the second source layer inside the parent level's depth
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// keep the second source row inside the parent level's height
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: source rows in level i-1 for (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
// NOTE(review): the condition selecting the 3D path versus the 2D path is not visible in this excerpt
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight source texels per output texel, rounded
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are not advanced by this loop; that only matters if w can exceed 1 here — confirm
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// four source texels per output texel, rounded
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
// two source texels (one per row), rounded
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight rectangle of 4-byte pixels into a texture at
// (blockx, blocky), then rebuild the mipmaps.  pixels is read as tightly
// packed rows of blockwidth texels.
// NOTE(review): in the visible lines the destination is always computed from
// mip level 0 and the mip parameter is not used — confirm against the full file.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
751 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
// NOTE(review): the statement that decrements blockheight is not visible in this excerpt
752 while (blockheight > 0)
754 memcpy(dst, pixels, blockwidth * 4);
// source advances by one block row, destination by one full texture row
755 pixels += blockwidth * 4;
756 dst += texture->mipmap[0][2] * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole of mip level 0 with the supplied pixels and rebuild the mipmaps.
// Does nothing when index does not name a live texture.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// NOTE(review): lines between the lookup and the copy are not visible in this excerpt
// mipmap[0][1] is the byte size of level 0
769 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770 DPSOFTRAST_Texture_CalculateMipmaps(index);
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
774 DPSOFTRAST_Texture *texture;
775 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776 return texture->mipmap[mip][2];
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
780 DPSOFTRAST_Texture *texture;
781 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782 return texture->mipmap[mip][3];
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
786 DPSOFTRAST_Texture *texture;
787 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788 return texture->mipmap[mip][4];
// Return a pointer to the first byte of the given mip level's pixels, or 0
// (a null pointer) when index does not name a live texture.
// NOTE(review): mip is used to index the mipmap table without a range check in the visible lines.
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
792 DPSOFTRAST_Texture *texture;
793 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// NOTE(review): lines between the lookup and the return are not visible in this excerpt
// mipmap[mip][0] is the level's byte offset within bytes
796 return texture->bytes + texture->mipmap[mip][0];
// Set a texture's filter mode.  Does nothing when index does not name a live texture.
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
800 DPSOFTRAST_Texture *texture;
801 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// filter values above LINEAR are refused for textures created without the mipmap flag
802 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
804 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
// NOTE(review): the lines between the error path and this assignment are not visible in this excerpt
809 texture->filter = filter;
812 static void DPSOFTRAST_Draw_FlushThreads(void);
814 static void DPSOFTRAST_Draw_SyncCommands(void)
816 if(dpsoftrast.usethreads) MEMORY_BARRIER;
817 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make at least `space` bytes of the command pool available, waiting on
// a worker thread when too much of the pool is still in use.
// NOTE(review): several lines of this function (local declarations, loop
// headers, returns) are not visible in this excerpt.
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
822 DPSOFTRAST_State_Thread *thread;
824 int freecommand = dpsoftrast.commandpool.freecommand;
825 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room according to the current bookkeeping
826 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
828 DPSOFTRAST_Draw_SyncCommands();
// measure how far each thread is behind freecommand, modulo the pool size,
// and track the largest distance seen
834 for (i = 0; i < dpsoftrast.numthreads; i++)
836 thread = &dpsoftrast.threads[i];
837 commandoffset = freecommand - thread->commandoffset;
838 if (commandoffset < 0)
839 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
840 if (commandoffset > usedcommands)
843 usedcommands = commandoffset;
846 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread until it signals waitcond
848 thread = &dpsoftrast.threads[waitindex];
849 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not yet caught up with drawcommand
850 if (thread->commandoffset != dpsoftrast.drawcommand)
852 thread->waiting = true;
// wake the thread if it flagged itself as starving
853 if (thread->starving) Thread_CondSignal(thread->drawcond);
854 Thread_CondWait(thread->waitcond, thread->drawmutex);
855 thread->waiting = false;
857 Thread_UnlockMutex(thread->drawmutex);
859 dpsoftrast.commandpool.usedcommands = usedcommands;
// Round size up to the next multiple of COMMAND_SIZE (which must be a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocate a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>; the requested size is the struct size rounded up
// with DPSOFTRAST_ALIGNCOMMAND.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a command with the given opcode
// and fill in the command header.
// NOTE(review): several lines (freecommand updates, the return) are not visible in this excerpt.
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
869 DPSOFTRAST_Command *command;
870 int freecommand = dpsoftrast.commandpool.freecommand;
871 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra space required besides the command itself: one command header, plus
// the unused tail of the pool when the command does not fit before the end
872 int extra = sizeof(DPSOFTRAST_Command);
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space in the pool: make room first
875 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
877 if (dpsoftrast.usethreads)
878 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
// NOTE(review): presumably the non-threaded alternative; the else is not visible in this excerpt
880 DPSOFTRAST_Draw_FlushThreads();
// reload the bookkeeping, which the calls above may have changed
881 freecommand = dpsoftrast.commandpool.freecommand;
882 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command does not fit in the tail: write a Reset command there and
// count the skipped tail bytes as used
884 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
886 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887 command->opcode = DPSOFTRAST_OPCODE_Reset;
888 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// place the new command and fill in its header
891 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892 command->opcode = opcode;
893 command->commandsize = size;
// NOTE(review): the statement guarded by this check (presumably a wrap to offset 0) is not visible
895 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
897 dpsoftrast.commandpool.freecommand = freecommand;
898 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Give `size` bytes back to the command pool by adjusting the pool's
// freecommand and usedcommands bookkeeping.
902 static void DPSOFTRAST_UndoCommand(int size)
904 int freecommand = dpsoftrast.commandpool.freecommand;
905 int usedcommands = dpsoftrast.commandpool.usedcommands;
// NOTE(review): the adjustment of freecommand by size and the condition
// guarding this wrap-around are not visible in this excerpt
908 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909 usedcommands -= size;
910 dpsoftrast.commandpool.freecommand = freecommand;
911 dpsoftrast.commandpool.usedcommands = usedcommands;
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
917 thread->viewport[0] = command->x;
918 thread->viewport[1] = command->y;
919 thread->viewport[2] = command->width;
920 thread->viewport[3] = command->height;
921 thread->validate |= DPSOFTRAST_VALIDATE_FB;
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
925 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
928 command->width = width;
929 command->height = height;
931 dpsoftrast.viewport[0] = x;
932 dpsoftrast.viewport[1] = y;
933 dpsoftrast.viewport[2] = width;
934 dpsoftrast.viewport[3] = height;
935 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): color as r, g, b, a floats.
// The interpreter fills the scissor rectangle of each attached color buffer,
// limited to the rows owned by the executing thread.
// NOTE(review): several lines of this function are not visible in this excerpt.
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
941 int i, x1, y1, x2, y2, w, h, x, y;
942 int miny1, maxy1, miny2, maxy2;
// make sure the framebuffer-derived state (scissor, row ranges) is current
946 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947 miny1 = thread->miny1;
948 maxy1 = thread->maxy1;
949 miny2 = thread->miny2;
950 maxy2 = thread->maxy2;
// scissor rectangle as edges
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the vertical range to the thread's rows
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 // FIXME: honor fb_colormask?
// pack the clear color as BGRA8
962 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963 for (i = 0;i < 4;i++)
// NOTE(review): the statement guarded by this check (presumably skipping unattached buffers) is not visible
965 if (!dpsoftrast.fb_colorpixels[i])
// walk the thread's two row bands: up to maxy1 first, then from miny2 up to maxy2
967 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
970 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971 for (x = x1;x < x2;x++)
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
978 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): a single float depth.
// The interpreter fills the scissor rectangle of the depth buffer, limited to
// the rows owned by the executing thread.
// NOTE(review): several lines of this function are not visible in this excerpt.
985 DEFCOMMAND(3, ClearDepth, float depth;)
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
988 int x1, y1, x2, y2, w, h, x, y;
989 int miny1, maxy1, miny2, maxy2;
// make sure the framebuffer-derived state (scissor, row ranges) is current
993 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994 miny1 = thread->miny1;
995 maxy1 = thread->maxy1;
996 miny2 = thread->miny2;
997 maxy2 = thread->maxy2;
// scissor rectangle as edges
998 x1 = thread->fb_scissor[0];
999 y1 = thread->fb_scissor[1];
1000 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the vertical range to the thread's rows
1002 if (y1 < miny1) y1 = miny1;
1003 if (y2 > maxy2) y2 = maxy2;
// convert the float depth to the integer depth buffer format
1008 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walk the thread's two row bands: up to maxy1 first, then from miny2 up to maxy2
1009 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010 for (;y < bandy;y++)
1012 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013 for (x = x1;x < x2;x++)
1017 void DPSOFTRAST_ClearDepth(float d)
1019 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1026 thread->colormask[0] = command->r != 0;
1027 thread->colormask[1] = command->g != 0;
1028 thread->colormask[2] = command->b != 0;
1029 thread->colormask[3] = command->a != 0;
1030 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1034 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1044 thread->depthtest = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1047 void DPSOFTRAST_DepthTest(int enable)
1049 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050 command->enable = enable;
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1056 thread->scissortest = command->enable;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ScissorTest command and stores the enable flag in it.
1059 void DPSOFTRAST_ScissorTest(int enable)
1061 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062 command->enable = enable;
// Scissor command (id 7): rectangle given as x, y, width, height.
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Applies a Scissor command: copies the rectangle into thread->scissor[0..3] and sets DPSOFTRAST_VALIDATE_FB.
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1068 thread->scissor[0] = command->x;
1069 thread->scissor[1] = command->y;
1070 thread->scissor[2] = command->width;
1071 thread->scissor[3] = command->height;
1072 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Scissor command and stores the rectangle size in it.
// NOTE(review): the stores of x and y into the command are not visible in this excerpt - confirm.
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1076 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1079 command->width = width;
1080 command->height = height;
// BlendFunc command (id 8): source and destination blend factors.
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command: stores both factors and sets DPSOFTRAST_VALIDATE_BLENDFUNC.
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1086 thread->blendfunc[0] = command->sfactor;
1087 thread->blendfunc[1] = command->dfactor;
1088 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and stores both blend factors in it.
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1092 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093 command->sfactor = sfactor;
1094 command->dfactor = dfactor;
// BlendSubtract command (id 9) carrying a single enable flag.
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command: stores the flag and sets DPSOFTRAST_VALIDATE_BLENDFUNC.
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1100 thread->blendsubtract = command->enable;
1101 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and stores the enable flag in it.
1103 void DPSOFTRAST_BlendSubtract(int enable)
1105 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106 command->enable = enable;
// DepthMask command (id 10) carrying a single enable flag.
1109 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command by storing the flag in thread->depthmask.
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1112 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and stores the enable flag in it.
1114 void DPSOFTRAST_DepthMask(int enable)
1116 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117 command->enable = enable;
// DepthFunc command (id 11) carrying the comparison function selector.
1120 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command by storing the selector in thread->depthfunc.
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1123 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and stores the selector in it.
1125 void DPSOFTRAST_DepthFunc(int func)
1127 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128 command->func = func;
// DepthRange command (id 12): near and far values.
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: near goes to depthrange[0], far to depthrange[1].
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1134 thread->depthrange[0] = command->nearval;
1135 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and stores both values in it.
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1139 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140 command->nearval = nearval;
1141 command->farval = farval;
// PolygonOffset command (id 13): two offset terms, alongnormal and intoview.
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: alongnormal goes to polygonoffset[0], intoview to polygonoffset[1].
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1147 thread->polygonoffset[0] = command->alongnormal;
1148 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and stores both terms in it.
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1152 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153 command->alongnormal = alongnormal;
1154 command->intoview = intoview;
// CullFace command (id 14) carrying the culling mode.
1157 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command by storing the mode in thread->cullface.
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1160 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and stores the mode in it.
1162 void DPSOFTRAST_CullFace(int mode)
1164 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165 command->mode = mode;
// Sets the global constant color dpsoftrast.color[0..3] directly; no command is allocated here.
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1170 dpsoftrast.color[0] = r;
1171 dpsoftrast.color[1] = g;
1172 dpsoftrast.color[2] = b;
1173 dpsoftrast.color[3] = a;
// Reads a rectangle of dpsoftrast.fb_colorpixels[0] into outpixels.
// blockx/blocky/blockwidth/blockheight give the requested rectangle; outpixels is written
// with a row pitch of blockwidth*4 bytes.
// NOTE(review): the per-pixel copy statements and the declarations of bx1/by1 are not visible in this excerpt.
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row pitches in bytes (4 bytes per pixel) for the output block and the framebuffer
1178 int outstride = blockwidth * 4;
1179 int instride = dpsoftrast.fb_width * 4;
1182 int bx2 = blockx + blockwidth;
1183 int by2 = blocky + blockheight;
1187 unsigned char *inpixels;
// clip the requested rectangle to the framebuffer
1191 if (bx1 < 0) bx1 = 0;
1192 if (by1 < 0) by1 = 0;
1193 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1196 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// two row loops follow, one taken when dpsoftrast.bigendian is set; both read source row
// (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped relative to the output
1197 if (dpsoftrast.bigendian)
1199 for (y = by1;y < by2;y++)
1201 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202 o = (unsigned char *)outpixels + (y - by1) * outstride;
1203 for (x = bx1;x < bx2;x++)
1216 for (y = by1;y < by2;y++)
1218 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from dpsoftrast.fb_colorpixels[0] at (sx, sy) into
// mip level `mip` of texture `index` at (tx, ty), clipping against both surfaces.
// Returns without copying when the texture lookup fails or mip is out of range.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1 and the computation of tw/th/sw/sh are not visible in this excerpt.
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1229 int tx2 = tx + width;
1230 int ty2 = ty + height;
1233 int sx2 = sx + width;
1234 int sy2 = sy + height;
1244 unsigned int *spixels;
1245 unsigned int *tpixels;
1246 DPSOFTRAST_Texture *texture;
1247 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the first color buffer
1250 spixels = dpsoftrast.fb_colorpixels[0];
1251 swidth = dpsoftrast.fb_width;
1252 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is the byte offset into texture->bytes, [2] and [3] are width and height
1253 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254 twidth = texture->mipmap[mip][2];
1255 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the mip level and the source rectangle to the framebuffer
1256 if (tx1 < 0) tx1 = 0;
1257 if (ty1 < 0) ty1 = 0;
1258 if (tx2 > twidth) tx2 = twidth;
1259 if (ty2 > theight) ty2 = theight;
1260 if (sx1 < 0) sx1 = 0;
1261 if (sy1 < 0) sy1 = 0;
1262 if (sx2 > swidth) sx2 = swidth;
1263 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source provides
1268 if (tw > sw) tw = sw;
1269 if (th > sh) th = sh;
// degenerate-size check; the statement it guards is not visible in this excerpt
1270 if (tw < 1 || th < 1)
// start at the vertically mirrored source row and walk it downward (sy1 - y) while the texture row goes up
1272 sy1 = sheight - 1 - sy1;
1273 for (y = 0;y < th;y++)
1274 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1275 if (texture->mipmaps > 1)
1276 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command (id 17): texture unit number and the texture to bind (may be NULL, see the check below).
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command on one thread: atomically decrements the bind count of the
// texture previously bound to that unit (if any), then records the new texture.
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1282 if (thread->texbound[command->unitnum])
1283 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284 thread->texbound[command->unitnum] = command->texture;
// Public entry point: binds texture `index` to texture unit `unitnum`.
// Sets dpsoftrast.errorstring when unitnum is outside [0, DPSOFTRAST_MAXTEXTUREUNITS) or when a
// nonzero index does not resolve to a texture. Index 0 with no texture is not treated as an error.
// NOTE(review): the early returns after the error assignments are not visible in this excerpt.
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1288 DPSOFTRAST_Command_SetTexture *command;
1289 DPSOFTRAST_Texture *texture;
1290 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1292 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1295 texture = DPSOFTRAST_Texture_GetByIndex(index);
1296 if (index && !texture)
1298 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1302 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303 command->unitnum = unitnum;
1304 command->texture = texture;
// mirror the binding in the global state
1306 dpsoftrast.texbound[unitnum] = texture;
// adds one bind per thread; presumably balanced by the per-thread ATOMIC_DECREMENT in the interpreter.
// NOTE(review): texture can be NULL here when index is 0; a guard is not visible in this excerpt - confirm.
1308 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in the global state.
// stride is used as a byte stride when the array is loaded (DPSOFTRAST_Array_Load adds firstvertex * stride to a byte pointer).
1311 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1313 dpsoftrast.pointer_vertex3f = vertex3f;
1314 dpsoftrast.stride_vertex = stride;
// Selects a float color array: records pointer and stride, and clears the unsigned-byte color pointer
// so only one of the two color sources is set at a time.
1316 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1318 dpsoftrast.pointer_color4f = color4f;
1319 dpsoftrast.pointer_color4ub = NULL;
1320 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array: records pointer and stride, and clears the float color pointer
// so only one of the two color sources is set at a time.
1322 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1324 dpsoftrast.pointer_color4f = NULL;
1325 dpsoftrast.pointer_color4ub = color4ub;
1326 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and stride for slot `unitnum`.
// NOTE(review): unitnum indexes the arrays without a range check in the visible lines - callers must pass a valid slot.
1328 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1330 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1331 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1332 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command (id 18): shader mode, permutation bits and the exact-specular-math flag.
1335 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command by copying all three fields into the thread state.
1336 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1338 thread->shader_mode = command->mode;
1339 thread->shader_permutation = command->permutation;
1340 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: allocates a SetShader command with the given values and also
// mirrors them into the global dpsoftrast state.
1342 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1344 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1345 command->mode = mode;
1346 command->permutation = permutation;
1347 command->exactspecularmath = exactspecularmath;
1349 dpsoftrast.shader_mode = mode;
1350 dpsoftrast.shader_permutation = permutation;
1351 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command (id 19): uniform slot and four float values.
1354 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into thread->uniform4f at float offset index*4.
1355 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1357 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: allocates a Uniform4f command holding (v0, v1, v2, v3) for slot `index`
// and mirrors the same values into the global dpsoftrast.uniform4f table.
1359 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1361 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362 command->index = index;
1363 command->val[0] = v0;
1364 command->val[1] = v1;
1365 command->val[2] = v2;
1366 command->val[3] = v3;
// each uniform slot occupies four consecutive floats
1368 dpsoftrast.uniform4f[index*4+0] = v0;
1369 dpsoftrast.uniform4f[index*4+1] = v1;
1370 dpsoftrast.uniform4f[index*4+2] = v2;
1371 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: v must point to at least four floats, which are
// copied into a new Uniform4f command and into the global dpsoftrast.uniform4f table.
1373 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1375 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1376 command->index = index;
1377 memcpy(command->val, v, sizeof(command->val));
1379 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command (id 20): uniform slot and sixteen floats, declared through ALIGN().
1382 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all sixteen floats into thread->uniform4f at
// float offset index*4, so one matrix spans four consecutive uniform slots.
1383 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1385 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: uploads `arraysize` 4x4 matrices read from v (16 floats each), starting at
// uniform slot `uniform`. One UniformMatrix4f command is allocated per matrix and the values are
// also mirrored into the global dpsoftrast.uniform4f table.
// NOTE(review): the condition that selects the transpose sequence is not visible in this excerpt
// (presumably the `transpose` parameter) - confirm.
1387 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix consumes four uniform slots (index += 4) and sixteen input floats (v += 16)
1391 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1393 __m128 m0, m1, m2, m3;
1394 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1395 command->index = (DPSOFTRAST_UNIFORM)index;
// the input may be unaligned: use unaligned loads unless v is a multiple of ALIGN_SIZE
1396 if (((size_t)v)&(ALIGN_SIZE-1))
1398 m0 = _mm_loadu_ps(v);
1399 m1 = _mm_loadu_ps(v+4);
1400 m2 = _mm_loadu_ps(v+8);
1401 m3 = _mm_loadu_ps(v+12);
1405 m0 = _mm_load_ps(v);
1406 m1 = _mm_load_ps(v+4);
1407 m2 = _mm_load_ps(v+8);
1408 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave pairs of rows, then recombine the halves so m0..m3 hold the former columns
1412 __m128 t0, t1, t2, t3;
1413 t0 = _mm_unpacklo_ps(m0, m1);
1414 t1 = _mm_unpacklo_ps(m2, m3);
1415 t2 = _mm_unpackhi_ps(m0, m1);
1416 t3 = _mm_unpackhi_ps(m2, m3);
1417 m0 = _mm_movelh_ps(t0, t1);
1418 m1 = _mm_movehl_ps(t1, t0);
1419 m2 = _mm_movelh_ps(t2, t3);
1420 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload ...
1422 _mm_store_ps(command->val, m0);
1423 _mm_store_ps(command->val+4, m1);
1424 _mm_store_ps(command->val+8, m2);
1425 _mm_store_ps(command->val+12, m3);
// ... and into the global uniform table (aligned stores, so these addresses must be 16-byte aligned)
1426 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1427 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1428 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1429 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command (id 21): uniform slot and one integer value.
1434 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command by storing the value in thread->uniform1i[index].
1435 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1437 thread->uniform1i[command->index] = command->val;
// Public entry point: allocates a Uniform1i command for slot `index` and mirrors i0 into the
// global dpsoftrast.uniform1i table.
// NOTE(review): the store of i0 into the command is not visible in this excerpt - confirm.
1439 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1441 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1442 command->index = index;
1445 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command (id 24): four plane coefficients.
1448 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the four coefficients into thread->clipplane and sets DPSOFTRAST_VALIDATE_FB.
1449 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1451 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1452 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ClipPlane command holding the coefficients (x, y, z, w).
1454 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1456 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1457 command->clipplane[0] = x;
1458 command->clipplane[1] = y;
1459 command->clipplane[2] = z;
1460 command->clipplane[3] = w;
// Copies `size` four-float items from a strided byte source into dst (written with aligned stores,
// so dst must be 16-byte aligned). `end` marks one past the last output float.
// NOTE(review): the loop headers and pointer increments are not visible in this excerpt.
1464 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1466 float *end = dst + size*4;
// aligned loads are only used when both the start address and the stride are multiples of ALIGN_SIZE
1467 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1471 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1480 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float items from a strided byte source into four-float items in dst,
// setting the fourth component of every output to 1.0f. dst is written with aligned stores.
// The recurring pattern shuffle(2,1,0,3) / move_ss(1.0f) / shuffle(0,3,2,1) rotates the lanes up by one,
// overwrites lane 0 with 1.0f, and rotates back, which yields (a, b, c, 1.0f).
// NOTE(review): loop headers and dst increments are not visible in this excerpt. The single-item paths
// load four floats per three-float item, so they read one float past each item - confirm the caller
// guarantees readable memory after the last item.
1487 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1489 float *end = dst + size*4;
// tightly packed input: handle four items (twelve floats, three vector loads) per step
1490 if (stride == sizeof(float[3]))
// end of the output range covered by whole groups of four items
1492 float *end4 = dst + (size&~3)*4;
1493 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1497 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// item 0: lanes 0..2 of v1
1498 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1499 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1500 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 1: lane 3 of v1 plus lanes 0..1 of v2
1501 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1502 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 2: lanes 2..3 of v2 plus lane 0 of v3
1504 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1505 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1506 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 3: lanes 1..3 of v3 (lane 0 is replaced by 1.0f before the rotate)
1508 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1509 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511 src += 4*sizeof(float[3]);
// same four-item expansion using aligned loads
1518 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1519 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1520 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1521 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1522 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1523 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1524 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1525 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1526 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1527 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1530 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532 src += 4*sizeof(float[3]);
// one item at a time: unaligned or aligned load chosen from the source address and stride
1536 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1540 __m128 v = _mm_loadu_ps((const float *)src);
1541 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1542 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1543 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1544 _mm_store_ps(dst, v);
1553 __m128 v = _mm_load_ps((const float *)src);
1554 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1555 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1556 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1557 _mm_store_ps(dst, v);
// Expands `size` two-float items from a strided byte source into four-float items (a, b, 0.0f, 1.0f).
// dst is written with aligned stores.
// NOTE(review): loop headers and dst increments are not visible in this excerpt.
1564 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1566 float *end = dst + size*4;
// constant upper half (0, 1) appended to every item
1567 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one vector load carries two items
1568 if (stride == sizeof(float[2]))
// end of the output range covered by whole pairs of items
1570 float *end2 = dst + (size&~1)*4;
1571 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1575 __m128 v = _mm_loadu_ps((const float *)src);
// first item: low half of v with the high half of v2; second item: high half of v with the high half of v2
1576 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1577 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1579 src += 2*sizeof(float[2]);
1586 __m128 v = _mm_load_ps((const float *)src);
1587 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1588 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1590 src += 2*sizeof(float[2]);
// single item: load two floats into the low half and keep (0, 1) in the high half
1596 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte items from a strided byte source into four-float items, scaling each
// byte by 1/255 so 0..255 maps to 0.0..1.0. dst is written with aligned stores.
// NOTE(review): loop headers and dst increments are not visible in this excerpt.
1602 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1604 float *end = dst + size*4;
1605 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: one 16-byte load carries four items
1606 if (stride == sizeof(unsigned char[4]))
// end of the output range covered by whole groups of four items
1608 float *end4 = dst + (size&~3)*4;
1609 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend bytes to 16 bits (v1 = items 0-1, v2 = items 2-3), then to 32 bits, convert and scale
1613 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1614 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1615 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1616 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1617 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1619 src += 4*sizeof(unsigned char[4]);
// same conversion using an aligned load
1626 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1627 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1630 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1632 src += 4*sizeof(unsigned char[4]);
// single item: read four bytes as one int.
// NOTE(review): this dereferences src as int, which assumes the platform tolerates unaligned int reads - confirm.
1638 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1639 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of the single four-float value at src. src is read once with an
// unaligned load; dst is written with aligned stores. `end` is one past the last output float.
// NOTE(review): the loop header and dst increment are not visible in this excerpt.
1645 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1647 float *end = dst + 4*size;
1648 __m128 v = _mm_loadu_ps(src);
1651 _mm_store_ps(dst, v);
// Transforms `numitems` four-float vertices from in4f by the 4x4 matrix inmatrix16f.
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the four consecutive
// four-float groups of the matrix and (x, y, z, w) are the input vertex lanes.
// NOTE(review): the destination of the computed vector, the loop headers and the return after
// the identity fast path are not visible in this excerpt.
1657 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1660 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1661 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1663 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1665 // fast case for identity matrix
1666 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1669 end = out4f + numitems*4;
// the matrix is loaded with unaligned loads, so it needs no particular alignment
1670 m0 = _mm_loadu_ps(inmatrix16f);
1671 m1 = _mm_loadu_ps(inmatrix16f + 4);
1672 m2 = _mm_loadu_ps(inmatrix16f + 8);
1673 m3 = _mm_loadu_ps(inmatrix16f + 12);
1674 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1678 __m128 v = _mm_loadu_ps(in4f);
// broadcast each input lane and accumulate lane * matrix group
1680 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1681 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1682 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1683 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// same computation using an aligned load of the input
1692 __m128 v = _mm_load_ps(in4f);
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1695 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1696 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1697 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` four-float vertices from in4f to out4f. memcpy is used, so the two ranges must not overlap.
1706 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1708 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Vertex helper macros operating on __m128 values.
// DPSOFTRAST_PROJECTVERTEX / DPSOFTRAST_PROJECTY (identical in the visible lines): rotate the input
// lanes up by one and replace lane 0 (which then holds the old lane 3, w) with 1.0f, compute
// viewportcenter + viewportscale * p / w per lane, then rotate back. The output's first three lanes
// are therefore the perspective-divided, viewport-mapped x, y, z, and its last lane is derived from 1/w.
// viewportcenter and viewportscale are consumed in the rotated lane order.
// DPSOFTRAST_TRANSFORMVERTEX: out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration of p in DPSOFTRAST_TRANSFORMVERTEX and the braces of all three
// macros are not visible in this excerpt.
1713 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1715 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1716 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1717 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1718 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1721 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1723 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1724 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1725 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1726 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1729 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1732 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1734 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1735 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1738 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// Computes a screen-space row range [*starty, *endy) for the axis-aligned box spanned by
// minposf/maxposf after transformation by inmatrix16f. minposf and maxposf are read with aligned loads.
// clipmask holds one bit per box corner (eight corners); a bit is cleared when that corner's
// clip distance (lane 2 + lane 3 of the transformed corner) is >= 0.
// NOTE(review): the return statement, the BBFRONT invocations for corners 0 and 7, and the
// #define line of BBCLIP are not visible in this excerpt - presumably clipmask is returned; confirm.
1740 int clipmask = 0xFF;
1741 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start outside the clamp range [-2, 2] applied at the end, so any projected value replaces them
1742 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1743 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1744 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix group so the transformed second component lands in lane 0,
// where the scalar (_ss) operations below work
1745 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1746 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1747 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1748 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance, and if it is not clipped
// clear its clipmask bit and fold lane0/w into minproj/maxproj
1749 #define BBFRONT(k, pos) \
1751 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1752 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1753 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1756 clipmask &= ~(1<<k); \
1757 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1758 minproj = _mm_min_ss(minproj, proj); \
1759 maxproj = _mm_max_ss(maxproj, proj); \
1763 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1764 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1766 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1767 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1768 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1772 if (clipmask&(1<<k)) \
1774 if (!(clipmask&(1<<(k^1)))) \
1776 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1777 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1778 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1779 minproj = _mm_min_ss(minproj, proj); \
1780 maxproj = _mm_max_ss(maxproj, proj); \
1782 if (!(clipmask&(1<<(k^2)))) \
1784 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1785 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1786 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1787 minproj = _mm_min_ss(minproj, proj); \
1788 maxproj = _mm_max_ss(maxproj, proj); \
1790 if (!(clipmask&(1<<(k^4)))) \
1792 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1793 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1794 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1795 minproj = _mm_min_ss(minproj, proj); \
1796 maxproj = _mm_max_ss(maxproj, proj); \
1800 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// BBCLIP(k) above: for a clipped corner k, each unclipped neighbour along one box axis (k^1, k^2, k^4)
// gives an edge crossing the clip boundary; the crossing point bb[k] + frac*(bb[n]-bb[k]) with
// frac = d[k]/(d[k]-d[n]) is divided by its w and folded into minproj/maxproj.
// move element 2 of the viewport vectors into lane 0 for the scalar math below
// (presumably the y term - confirm against how fb_viewportcenter/fb_viewportscale are filled)
1801 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1802 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it through the viewport
1803 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1804 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1805 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1806 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row comes from maxproj and the end row from minproj (truncated, end is +1);
// presumably the viewport scale for this axis is negative - confirm
1807 *starty = _mm_cvttss_si32(maxproj);
1808 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` four-float vertices without a model transform: each input vertex is stored
// unchanged to out4f, its viewport-projected form is stored to screen4f, and the per-lane min/max of
// the inputs is accumulated. The resulting box is passed to DPSOFTRAST_Vertex_BoundY with an identity
// matrix, which writes *starty/*endy and provides the return value.
// out4f and screen4f are written with aligned stores; in4f may be unaligned.
// NOTE(review): loop headers and pointer increments are not visible in this excerpt.
1812 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1814 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1815 float *end = out4f + numitems*4;
1816 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1817 __m128 minpos, maxpos;
1818 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1820 minpos = maxpos = _mm_loadu_ps(in4f);
1823 __m128 v = _mm_loadu_ps(in4f);
1824 minpos = _mm_min_ps(minpos, v);
1825 maxpos = _mm_max_ps(maxpos, v);
1826 _mm_store_ps(out4f, v);
1827 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828 _mm_store_ps(screen4f, v);
// same loop body using aligned loads
1836 minpos = maxpos = _mm_load_ps(in4f);
1839 __m128 v = _mm_load_ps(in4f);
1840 minpos = _mm_min_ps(minpos, v);
1841 maxpos = _mm_max_ps(maxpos, v);
1842 _mm_store_ps(out4f, v);
1843 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1844 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because DPSOFTRAST_Vertex_BoundY takes float pointers
1852 ALIGN(float minposf[4]);
1853 ALIGN(float maxposf[4]);
1854 _mm_store_ps(minposf, minpos);
1855 _mm_store_ps(maxposf, maxpos);
1856 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` four-float vertices by inmatrix16f, stores the transformed vertices to out4f
// and their viewport-projected form to screen4f, and accumulates the per-lane min/max of the
// untransformed inputs. The box and the matrix are passed to DPSOFTRAST_Vertex_BoundY, which writes
// *starty/*endy and provides the return value.
// An identity matrix (bytewise comparison) is delegated to DPSOFTRAST_Vertex_Project.
// NOTE(review): loop headers and pointer increments are not visible in this excerpt.
1861 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1863 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1864 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1866 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1867 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1868 end = out4f + numitems*4;
1869 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1870 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is loaded with unaligned loads, so it needs no particular alignment
1871 m0 = _mm_loadu_ps(inmatrix16f);
1872 m1 = _mm_loadu_ps(inmatrix16f + 4);
1873 m2 = _mm_loadu_ps(inmatrix16f + 8);
1874 m3 = _mm_loadu_ps(inmatrix16f + 12);
1875 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1877 minpos = maxpos = _mm_loadu_ps(in4f);
1880 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform; DPSOFTRAST_Vertex_BoundY applies the matrix to the box itself
1881 minpos = _mm_min_ps(minpos, v);
1882 maxpos = _mm_max_ps(maxpos, v);
1883 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1884 _mm_store_ps(out4f, v);
1885 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1886 _mm_store_ps(screen4f, v);
// same loop body using aligned loads
1894 minpos = maxpos = _mm_load_ps(in4f);
1897 __m128 v = _mm_load_ps(in4f);
1898 minpos = _mm_min_ps(minpos, v);
1899 maxpos = _mm_max_ps(maxpos, v);
1900 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1901 _mm_store_ps(out4f, v);
1902 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1903 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because DPSOFTRAST_Vertex_BoundY takes float pointers
1911 ALIGN(float minposf[4]);
1912 ALIGN(float maxposf[4]);
1913 _mm_store_ps(minposf, minpos);
1914 _mm_store_ps(maxposf, maxpos);
1915 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads the client vertex array selected by `inarray` into dpsoftrast.post_array4f[outarray] as
// four-float items, covering dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex.
// NOTE(review): the switch header, break statements, the texcoord case labels and the return
// statement are not visible in this excerpt - presumably outf is returned; confirm.
1921 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1924 float *outf = dpsoftrast.post_array4f[outarray];
1925 const unsigned char *inb;
1926 int firstvertex = dpsoftrast.firstvertex;
1927 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex, expanded to four
1931 case DPSOFTRAST_ARRAY_POSITION:
1932 stride = dpsoftrast.stride_vertex;
1933 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1934 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, otherwise byte array if set, otherwise the constant dpsoftrast.color
1936 case DPSOFTRAST_ARRAY_COLOR:
1937 stride = dpsoftrast.stride_color;
1938 if (dpsoftrast.pointer_color4f)
1940 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1943 else if (dpsoftrast.pointer_color4ub)
1945 stride = dpsoftrast.stride_color;
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1947 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1951 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the slot is inarray - DPSOFTRAST_ARRAY_TEXCOORD0; the loader is chosen by component count
1955 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1956 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1958 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1959 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1962 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1965 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1968 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms post array `outarray` in place by inmatrix16f. When inarray >= 0 the array is first
// loaded from the client array `inarray`; otherwise the existing contents of the post array are used.
// NOTE(review): the return statement is not visible in this excerpt.
1980 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1982 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1983 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects post array `outarray` into dpsoftrast.screencoord4f and records the draw row range
// (drawstarty/drawendy) and the clip result (drawclipped). When inarray >= 0 the array is first
// loaded from the client array `inarray`.
// NOTE(review): the return statement is not visible in this excerpt.
1988 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1991 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1992 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms post array `outarray` in place by inmatrix16f, projects it into dpsoftrast.screencoord4f
// and records the draw row range (drawstarty/drawendy) and the clip result (drawclipped).
// When inarray >= 0 the array is first loaded from the client array `inarray`.
// NOTE(review): the return statement is not visible in this excerpt.
2000 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2003 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2004 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for the pixels startx..endx-1 of a span.
// w at the span origin is triangle->w[2] + span->x*w[0] + span->y*w[1]; w[0] is the slope per pixel in x.
// NOTE(review): the stores into zf and the declarations of x, z and dz are not visible in this excerpt.
2011 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2014 int startx = span->startx;
2015 int endx = span->endx;
2016 float wslope = triangle->w[0];
2017 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel
2018 float endz = 1.0f / (w + wslope * startx);
// zero slope: w does not change along the span
2019 if (triangle->w[0] == 0)
2021 // LordHavoc: fast flat polygons (HUD/menu)
2022 for (x = startx;x < endx;x++)
// otherwise walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2026 for (x = startx;x < endx;)
2028 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is clipped to the final pixel
2030 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact reciprocal at the sub-span end, stepped linearly in between (dz is 0 for a one-pixel sub-span)
2031 endz = 1.0f / (w + wslope * nextsub);
2032 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2033 for (; x <= endsub; x++, z += dz)
// Commits one span of shaded BGRA8 pixels (in4ub, indexed by x) to the color
// framebuffer dpsoftrast.fb_colorpixels[0], at row span->y starting from
// column span->x.  Pixels whose span->pixelmask entry is zero are skipped;
// the remaining runs are copied or blended according to thread->fb_blendmode.
// Side effect: span->pixelmask is modified (entries are cleared by the alpha
// tests below, and two sentinel bytes are stored at [endx] and [endx+1]).
2038 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2042 int startx = span->startx;
2043 int endx = span->endx;
// the same input and framebuffer memory is viewed both per byte and per
// 32-bit pixel
2046 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2047 unsigned char * RESTRICT pixelmask = span->pixelmask;
2048 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2049 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both framebuffer views to the start of this span
2052 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2053 pixeli += span->y * dpsoftrast.fb_width + span->x;
2054 // handle alphatest now (this affects depth writes too)
2055 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2056 for (x = startx;x < endx;x++)
// alpha below 128 fails the alphakill test
2057 if (in4ub[x*4+3] < 128)
2058 pixelmask[x] = false;
2059 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2060 // helps sprites, text and hud artwork
2061 switch(thread->fb_blendmode)
2063 case DPSOFTRAST_BLENDMODE_ALPHA:
2064 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2065 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alternate between runs of non-zero source alpha (left alone) and runs of
// zero source alpha (masked off)
2067 for (x = startx;x < endx;x++)
2069 if (in4ub[x*4+3] >= 1)
2074 while (++x < endx && in4ub[x*4+3] >= 1) ;
2076 if (x >= endx) break;
2078 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2079 if (x >= endx) break;
// the remaining blend modes get no extra alpha-based masking in the lines
// visible here
2086 case DPSOFTRAST_BLENDMODE_OPAQUE:
2087 case DPSOFTRAST_BLENDMODE_ADD:
2088 case DPSOFTRAST_BLENDMODE_INVMOD:
2089 case DPSOFTRAST_BLENDMODE_MUL:
2090 case DPSOFTRAST_BLENDMODE_MUL2:
2091 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2092 case DPSOFTRAST_BLENDMODE_INVADD:
2095 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): this writes two bytes past the span, so the pixelmask buffer
// must hold at least endx+2 entries - confirm against its allocation
2096 pixelmask[endx] = 1;
2097 pixelmask[endx+1] = 0;
2098 // LordHavoc: use a double loop to identify subspans, this helps the
2099 // optimized copy/blend loops to perform at their best, most triangles
2100 // have only one run of pixels, and do the search using wide reads...
2104 // if this pixel is masked off, it's probably not alone...
2111 // the 4-item search must be aligned or else it stalls badly
2112 if ((x & 3) && !pixelmask[x])
2114 if(pixelmask[x]) goto endmasked;
2118 if(pixelmask[x]) goto endmasked;
2122 if(pixelmask[x]) goto endmasked;
// skip masked-off pixels four mask bytes at a time
// NOTE(review): the unsigned int cast assumes pixelmask + x is suitably
// aligned here and that such type punning is tolerated by the build - confirm
2127 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2131 for (;!pixelmask[x];x++)
2133 // rather than continue the loop, just check the end variable
2138 // find length of subspan
2141 if (subx + 8 < endx)
2145 if(!pixelmask[subx]) goto endunmasked;
2149 if(!pixelmask[subx]) goto endunmasked;
2153 if(!pixelmask[subx]) goto endunmasked;
// extend the run four mask bytes at a time while all four are set
2158 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2162 for (;pixelmask[subx];subx++)
2164 // the checks can overshoot, so make sure to clip it...
2168 // now that we know the subspan length... process!
2169 switch(thread->fb_blendmode)
2171 case DPSOFTRAST_BLENDMODE_OPAQUE:
// opaque: plain copy of pixels x..subx-1, via memcpy or unaligned 128-bit
// loads/stores of 16 and then 4 pixels at a time
2175 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2180 while (x + 16 <= subx)
2182 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2183 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2184 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2185 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2190 while (x + 4 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2198 pixeli[x+1] = ini[x+1];
2208 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): widens source (src) and framebuffer (dst)
// bytes to 16-bit lanes, applies blend2 to two pixels at a time and blend1 to
// a single leftover pixel, then packs back to bytes with unsigned saturation
2209 #define FINISHBLEND(blend2, blend1) \
2210 for (;x + 1 < subx;x += 2) \
2213 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2214 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2216 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2221 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2222 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2224 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2228 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2231 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2232 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2235 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2237 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2240 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2244 case DPSOFTRAST_BLENDMODE_ADD:
2245 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2247 case DPSOFTRAST_BLENDMODE_INVMOD:
2249 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2254 case DPSOFTRAST_BLENDMODE_MUL:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2257 case DPSOFTRAST_BLENDMODE_MUL2:
2258 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2260 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2262 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2265 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2266 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2269 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2271 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2274 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2278 case DPSOFTRAST_BLENDMODE_INVADD:
2280 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2291 // warning: this is SLOW, only use if the optimized per-span functions won't do
2293 const unsigned char * RESTRICT pixelbase;
2294 const unsigned char * RESTRICT pixel[4];
2295 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2296 int wrapmask[2] = { width-1, height-1 };
2297 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2298 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2300 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2301 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2302 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2303 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2304 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2305 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2306 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2308 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2309 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2310 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2311 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2315 tci[0] &= wrapmask[0];
2316 tci[1] &= wrapmask[1];
2317 tci1[0] &= wrapmask[0];
2318 tci1[1] &= wrapmask[1];
2320 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2321 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2322 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2323 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2324 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2325 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2326 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2327 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2331 int tci[2] = { x * width, y * height };
2332 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2334 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2335 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2339 tci[0] &= wrapmask[0];
2340 tci[1] &= wrapmask[1];
2342 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Float-output texture fetch for one span.  For every x in
// [span->startx, span->endx) the texture bound to thread->texbound[texunitindex]
// is sampled at the coordinates (data + slope*x) * zf[x] scaled by the mip
// size, where data/slope come from DPSOFTRAST_CALCATTRIB4F for `arrayindex`.
// Texel bytes 2, 1, 0, 3 are scaled to the 0..1 range and written to
// out4f[x*4+0 .. x*4+3].
2351 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2354 int startx = span->startx;
2355 int endx = span->endx;
2360 float tc[2], endtc[2];
2362 unsigned int tci[2];
2363 unsigned int tci1[2];
2364 unsigned int tcimin[2];
2365 unsigned int tcimax[2];
2370 const unsigned char * RESTRICT pixelbase;
2371 const unsigned char * RESTRICT pixel[4];
2372 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2373 // if no texture is bound, just fill it with white
2376 for (x = startx;x < endx;x++)
2378 out4f[x*4+0] = 1.0f;
2379 out4f[x*4+1] = 1.0f;
2380 out4f[x*4+2] = 1.0f;
2381 out4f[x*4+3] = 1.0f;
// the mip level was chosen per triangle and per texture unit
2385 mip = triangle->mip[texunitindex];
2386 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2387 // if this mipmap of the texture is 1 pixel, just fill it with that color
2388 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads texture->bytes[0..3] rather than pixelbase[0..3],
// i.e. offset 0 of the texture data instead of the selected mip - confirm
// whether that is intended
2390 c[0] = texture->bytes[2] * (1.0f/255.0f);
2391 c[1] = texture->bytes[1] * (1.0f/255.0f);
2392 c[2] = texture->bytes[0] * (1.0f/255.0f);
2393 c[3] = texture->bytes[3] * (1.0f/255.0f);
2394 for (x = startx;x < endx;x++)
2396 out4f[x*4+0] = c[0];
2397 out4f[x*4+1] = c[1];
2398 out4f[x*4+2] = c[2];
2399 out4f[x*4+3] = c[3];
2403 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2404 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2405 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the width and height of this mip level
2406 tcscale[0] = texture->mipmap[mip][2];
2407 tcscale[1] = texture->mipmap[mip][3];
2408 tciwidth = texture->mipmap[mip][2];
2411 tcimax[0] = texture->mipmap[mip][2]-1;
2412 tcimax[1] = texture->mipmap[mip][3]-1;
2413 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2414 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact texel-space coordinate at the first pixel of the span
2415 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2416 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// the coordinate is evaluated exactly at each subspan end and stepped
// linearly in 20.12 fixed point in between
2422 for (x = startx;x < endx;)
2424 unsigned int subtc[2];
2425 unsigned int substep[2];
2426 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2427 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2428 if (nextsub >= endx)
2430 nextsub = endsub = endx-1;
2431 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2435 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2436 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): subtc and substep are unsigned, so negative coordinates or
// steps depend on how the float -> unsigned conversion behaves - confirm
2442 substep[0] = (endtc[0] - tc[0]) * subscale;
2443 substep[1] = (endtc[1] - tc[1]) * subscale;
2444 subtc[0] = tc[0] * (1<<12);
2445 subtc[1] = tc[1] * (1<<12);
2448 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// bilinear, texel indices limited to the range tcimin..tcimax
2450 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the blend fraction; the four weights sum to 1<<24
2452 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2453 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2454 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2455 tci[0] = subtc[0]>>12;
2456 tci[1] = subtc[1]>>12;
2457 tci1[0] = tci[0] + 1;
2458 tci1[1] = tci[1] + 1;
2459 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2460 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2461 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2462 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2463 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2464 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2465 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2466 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 is 255 << 24: removes the weight scale and maps bytes to 0..1
2467 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2468 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2469 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2470 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2471 out4f[x*4+0] = c[0];
2472 out4f[x*4+1] = c[1];
2473 out4f[x*4+2] = c[2];
2474 out4f[x*4+3] = c[3];
// bilinear, texel indices wrapped with tciwrapmask
2479 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2481 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2482 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2483 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2484 tci[0] = subtc[0]>>12;
2485 tci[1] = subtc[1]>>12;
2486 tci1[0] = tci[0] + 1;
2487 tci1[1] = tci[1] + 1;
2488 tci[0] &= tciwrapmask[0];
2489 tci[1] &= tciwrapmask[1];
2490 tci1[0] &= tciwrapmask[0];
2491 tci1[1] &= tciwrapmask[1];
2492 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2493 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2494 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2495 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2496 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2497 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2498 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2499 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2500 out4f[x*4+0] = c[0];
2501 out4f[x*4+1] = c[1];
2502 out4f[x*4+2] = c[2];
2503 out4f[x*4+3] = c[3];
2507 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// single texel per pixel, index limited to the range tcimin..tcimax
2509 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2511 tci[0] = subtc[0]>>12;
2512 tci[1] = subtc[1]>>12;
2513 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2514 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2515 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2516 c[0] = pixel[0][2] * (1.0f / 255.0f);
2517 c[1] = pixel[0][1] * (1.0f / 255.0f);
2518 c[2] = pixel[0][0] * (1.0f / 255.0f);
2519 c[3] = pixel[0][3] * (1.0f / 255.0f);
2520 out4f[x*4+0] = c[0];
2521 out4f[x*4+1] = c[1];
2522 out4f[x*4+2] = c[2];
2523 out4f[x*4+3] = c[3];
// single texel per pixel, index wrapped with tciwrapmask
2528 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2530 tci[0] = subtc[0]>>12;
2531 tci[1] = subtc[1]>>12;
2532 tci[0] &= tciwrapmask[0];
2533 tci[1] &= tciwrapmask[1];
2534 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2535 c[0] = pixel[0][2] * (1.0f / 255.0f);
2536 c[1] = pixel[0][1] * (1.0f / 255.0f);
2537 c[2] = pixel[0][0] * (1.0f / 255.0f);
2538 c[3] = pixel[0][3] * (1.0f / 255.0f);
2539 out4f[x*4+0] = c[0];
2540 out4f[x*4+1] = c[1];
2541 out4f[x*4+2] = c[2];
2542 out4f[x*4+3] = c[3];
2549 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2553 int startx = span->startx;
2554 int endx = span->endx;
2556 __m128 data, slope, tcscale;
2557 __m128i tcsize, tcmask, tcoffset, tcmax;
2559 __m128i subtc, substep, endsubtc;
2562 int affine; // LordHavoc: optimized affine texturing case
2563 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2564 const unsigned char * RESTRICT pixelbase;
2565 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2566 // if no texture is bound, just fill it with white
2569 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2572 mip = triangle->mip[texunitindex];
2573 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2574 // if this mipmap of the texture is 1 pixel, just fill it with that color
2575 if (texture->mipmap[mip][1] == 4)
2577 unsigned int k = *((const unsigned int *)pixelbase);
2578 for (x = startx;x < endx;x++)
2582 affine = zf[startx] == zf[endx-1];
2583 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2584 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2585 flags = texture->flags;
2586 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2587 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2588 tcscale = _mm_cvtepi32_ps(tcsize);
2589 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2590 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2591 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2593 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2594 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2595 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2596 tcmax = _mm_packs_epi32(tcmask, tcmask);
2597 for (x = startx;x < endx;)
2599 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2600 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2601 if (nextsub >= endx || affine)
2603 nextsub = endsub = endx-1;
2604 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2608 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2610 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2611 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2612 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2613 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2614 substep = _mm_slli_epi32(substep, 1);
2617 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2618 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2620 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2621 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2623 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2624 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2625 tci = _mm_madd_epi16(tci, tcoffset);
2626 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2627 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2628 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2629 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2630 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2631 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2632 fracm = _mm_srli_epi16(subtc, 1);
2633 pix1 = _mm_add_epi16(pix1,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636 pix3 = _mm_add_epi16(pix3,
2637 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2638 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2639 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2640 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2641 pix2 = _mm_add_epi16(pix2,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2643 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2644 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2648 const unsigned char * RESTRICT ptr1;
2649 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2650 tci = _mm_madd_epi16(tci, tcoffset);
2651 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2652 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2653 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2654 fracm = _mm_srli_epi16(subtc, 1);
2655 pix1 = _mm_add_epi16(pix1,
2656 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2659 pix1 = _mm_add_epi16(pix1,
2660 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2661 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2662 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2666 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2668 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2670 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2671 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2672 tci = _mm_madd_epi16(tci, tcoffset);
2673 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2674 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2675 _mm_setzero_si128());
2676 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2677 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2678 _mm_setzero_si128());
2679 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2680 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2681 tci = _mm_madd_epi16(tci, tcoffset);
2682 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684 _mm_setzero_si128());
2685 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687 _mm_setzero_si128());
2688 fracm = _mm_srli_epi16(subtc, 1);
2689 pix1 = _mm_add_epi16(pix1,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2692 pix3 = _mm_add_epi16(pix3,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2694 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2695 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2696 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2697 pix2 = _mm_add_epi16(pix2,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2699 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2700 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2704 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2705 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2706 tci = _mm_madd_epi16(tci, tcoffset);
2707 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2708 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2709 _mm_setzero_si128());
2710 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2711 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2712 _mm_setzero_si128());
2713 fracm = _mm_srli_epi16(subtc, 1);
2714 pix1 = _mm_add_epi16(pix1,
2715 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2717 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2718 pix1 = _mm_add_epi16(pix1,
2719 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2720 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2721 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2727 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2729 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2730 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2731 tci = _mm_madd_epi16(tci, tcoffset);
2732 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2733 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2734 _mm_setzero_si128());
2735 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2736 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2737 _mm_setzero_si128());
2738 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2739 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2742 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2743 _mm_setzero_si128());
2744 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2745 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2746 _mm_setzero_si128());
2747 fracm = _mm_srli_epi16(subtc, 1);
2748 pix1 = _mm_add_epi16(pix1,
2749 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2750 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2751 pix3 = _mm_add_epi16(pix3,
2752 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2753 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2754 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2755 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2756 pix2 = _mm_add_epi16(pix2,
2757 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2758 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2759 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2763 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2764 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2765 tci = _mm_madd_epi16(tci, tcoffset);
2766 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2767 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2768 _mm_setzero_si128());
2769 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2770 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2771 _mm_setzero_si128());
2772 fracm = _mm_srli_epi16(subtc, 1);
2773 pix1 = _mm_add_epi16(pix1,
2774 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2775 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2776 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2777 pix1 = _mm_add_epi16(pix1,
2778 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2779 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2780 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2787 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2789 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2791 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2792 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2793 tci = _mm_madd_epi16(tci, tcoffset);
2794 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2795 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2799 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2800 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2801 tci = _mm_madd_epi16(tci, tcoffset);
2802 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2808 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2810 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2811 tci = _mm_and_si128(tci, tcmax);
2812 tci = _mm_madd_epi16(tci, tcoffset);
2813 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2814 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2818 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2819 tci = _mm_and_si128(tci, tcmax);
2820 tci = _mm_madd_epi16(tci, tcoffset);
2821 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2830 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2833 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadowmap sampling hook: takes a pointer to float components and returns a
// float.
// NOTE(review): presumably the result is a shadowing factor for the given
// lookup vector - confirm against the function body and its callers
2836 static float DPSOFTRAST_SampleShadowmap(const float *vector)
2843 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2846 int startx = span->startx;
2847 int endx = span->endx;
2852 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2853 for (x = startx;x < endx;x++)
2856 c[0] = (data[0] + slope[0]*x) * z;
2857 c[1] = (data[1] + slope[1]*x) * z;
2858 c[2] = (data[2] + slope[2]*x) * z;
2859 c[3] = (data[3] + slope[3]*x) * z;
2860 out4f[x*4+0] = in4f[x*4+0] * c[0];
2861 out4f[x*4+1] = in4f[x*4+1] * c[1];
2862 out4f[x*4+2] = in4f[x*4+2] * c[2];
2863 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Float span helper: writes an interpolated vertex attribute straight to out4f.
// For every x in [startx, endx) and each of the 4 channels:
//   out4f = (data + slope*x) * z
// data/slope are produced by DPSOFTRAST_CALCATTRIB4F for attribute `arrayindex`.
// NOTE(review): z is presumably loaded from zf[x] on a line not visible in this
// listing — confirm.
2869 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2872 int startx = span->startx;
2873 int endx = span->endx;
2878 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2879 for (x = startx;x < endx;x++)
// attribute value at this pixel, scaled by z
2882 c[0] = (data[0] + slope[0]*x) * z;
2883 c[1] = (data[1] + slope[1]*x) * z;
2884 c[2] = (data[2] + slope[2]*x) * z;
2885 c[3] = (data[3] + slope[3]*x) * z;
2886 out4f[x*4+0] = c[0];
2887 out4f[x*4+1] = c[1];
2888 out4f[x*4+2] = c[2];
2889 out4f[x*4+3] = c[3];
// Float span helper for bloom: out4f = ina4f + max(inb4f - subcolor, 0),
// evaluated per channel for every x in [startx, endx).
// subcolor points at 4 floats; they are copied into localcolor before the loop.
// `triangle` is not used by the visible code.
2895 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2897 int x, startx = span->startx, endx = span->endx;
2898 float c[4], localcolor[4];
2899 localcolor[0] = subcolor[0];
2900 localcolor[1] = subcolor[1];
2901 localcolor[2] = subcolor[2];
2902 localcolor[3] = subcolor[3];
2903 for (x = startx;x < endx;x++)
// subtract the threshold colour and clamp negative results to zero
2905 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2906 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2907 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2908 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2909 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2910 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2911 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2912 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span helper: out4f = ina4f * inb4f, per channel, for x in [startx, endx).
// `triangle` is not used by the visible code.
2918 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2920 int x, startx = span->startx, endx = span->endx;
2921 for (x = startx;x < endx;x++)
2923 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2924 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2925 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2926 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span helper: out4f = ina4f + inb4f, per channel, for x in [startx, endx).
// No clamping is applied. `triangle` is not used by the visible code.
2932 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2934 int x, startx = span->startx, endx = span->endx;
2935 for (x = startx;x < endx;x++)
2937 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2938 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2939 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2940 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span helper: blends inb4f over ina4f for x in [startx, endx):
//   out4f = ina4f * a + inb4f * b, with a = 1 - (alpha channel of inb4f).
// NOTE(review): b is assigned on a line not visible in this listing; presumably
// b = inb4f[x*4+3] so that a + b == 1 — confirm.
2946 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2948 int x, startx = span->startx, endx = span->endx;
2950 for (x = startx;x < endx;x++)
// weight of the first buffer is the inverse of the second buffer's alpha
2952 a = 1.0f - inb4f[x*4+3];
2954 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2955 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2956 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2957 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span helper: blends a constant colour over in4f for x in [startx, endx):
//   out4f = in4f * (1 - color[3]) + color * color[3], per channel
// (the alpha channel is blended by the same formula).
// `triangle` is not used by the visible code.
2963 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2965 int x, startx = span->startx, endx = span->endx;
2966 float localcolor[4], ilerp, lerp;
2967 localcolor[0] = color[0];
2968 localcolor[1] = color[1];
2969 localcolor[2] = color[2];
2970 localcolor[3] = color[3];
// blend weights are loop-invariant: lerp is the colour's alpha, ilerp its complement
2971 ilerp = 1.0f - localcolor[3];
2972 lerp = localcolor[3];
2973 for (x = startx;x < endx;x++)
2975 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2976 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2977 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2978 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span helper: multiplies the BGRA8 pixels of in4ub by an interpolated
// vertex attribute and writes the result to out4ub.
// The attribute is evaluated exactly (data + slope*x, times zf[x]) only at
// sub-span boundaries and stepped linearly in 16-bit fixed point (scale 256)
// in between; sub-spans are at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels long.
// NOTE(review): several lines of this function (declarations, braces, the
// mod/submod hand-over between sub-spans, the guard of the single-pixel tail)
// are not visible in this listing.
2985 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2989 int startx = span->startx;
2990 int endx = span->endx;
2993 __m128i submod, substep, endsubmod;
2994 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap elements 0 and 2 of the attribute (presumably RGBA -> BGRA channel order)
2995 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2996 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, and its fixed-point (x256) form
2997 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2998 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2999 for (x = startx; x < endx;)
3001 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3002 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end on the final pixel and rescale the step
3003 if (nextsub >= endx)
3005 nextsub = endsub = endx-1;
3006 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span and the per-pixel step towards it
3010 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3011 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3012 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack two consecutive pixels' modulators (submod, submod+step) into one register
3013 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3014 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3015 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high half of each 16-bit lane (value*256) so that
// mulhi_epu16 with the x256 modulator yields value*modulator
3017 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3018 pix = _mm_mulhi_epu16(pix, submod);
3019 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3023 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3024 pix = _mm_mulhi_epu16(pix, submod);
3025 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span helper: writes an interpolated vertex attribute to out4ub as BGRA8.
// The attribute is evaluated exactly (data + slope*x, times zf[x]) only at
// sub-span boundaries and stepped linearly in 16-bit fixed point (scale 4095)
// in between; the fixed-point value is shifted right by 4 and saturated to
// 8 bits when stored.
// NOTE(review): several lines of this function (declarations, braces, the
// mod/submod hand-over between sub-spans, the guard of the single-pixel tail)
// are not visible in this listing.
3032 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3036 int startx = span->startx;
3037 int endx = span->endx;
3040 __m128i submod, substep, endsubmod;
3041 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap elements 0 and 2 of the attribute (presumably RGBA -> BGRA channel order)
3042 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3043 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, and its fixed-point (x4095) form
3044 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3045 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3046 for (x = startx; x < endx;)
3048 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3049 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end on the final pixel and rescale the step
3050 if (nextsub >= endx)
3052 nextsub = endsub = endx-1;
3053 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span and the per-pixel step towards it
3057 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3058 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3059 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack two consecutive pixels' values (submod, submod+step) into one register
3060 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3061 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3062 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// 12-bit fixed point -> 8-bit: arithmetic shift by 4, then unsigned saturation
3064 __m128i pix = _mm_srai_epi16(submod, 4);
3065 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3069 __m128i pix = _mm_srai_epi16(submod, 4);
3070 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom helper: out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)),
// per channel, for the span's pixels. subcolor points at 4 floats.
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
// `triangle` is not used by the visible code.
3077 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3080 int x, startx = span->startx, endx = span->endx;
// scale to 0..255 integers and swap elements 0 and 2 (presumably RGBA -> BGRA)
3081 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes; the same 4 channels appear in both halves (one per pixel)
3082 localcolor = _mm_packs_epi32(localcolor, localcolor);
3083 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero
3087 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3088 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3092 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3094 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3095 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span helper: out4ub = (ina4ub * inb4ub) / 256, per channel.
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
// `triangle` is not used by the visible code.
3100 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3103 int x, startx = span->startx, endx = span->endx;
3104 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds plain 8-bit values in 16-bit lanes; pix2 holds values in the high
// byte (value*256), so mulhi_epu16 yields (a * b*256) >> 16 = a*b/256
3106 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3107 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3108 pix1 = _mm_mulhi_epu16(pix1, pix2);
3109 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3113 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3114 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3115 pix1 = _mm_mulhi_epu16(pix1, pix2);
3116 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span helper: out4ub = saturate(ina4ub + inb4ub), per channel.
// The sum is formed in 16-bit lanes and clamped to 0..255 by _mm_packus_epi16.
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
// `triangle` is not used by the visible code.
3121 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3124 int x, startx = span->startx, endx = span->endx;
3125 for (x = startx;x+2 <= endx;x+=2)
3127 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3128 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3129 pix1 = _mm_add_epi16(pix1, pix2);
3130 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3134 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3135 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3136 pix1 = _mm_add_epi16(pix1, pix2);
3137 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span helper: out4ub = saturate(ina4ub + inb4ub * tint), per channel,
// where tint is the 4-float vector inbtintbgra converted to x256 fixed point.
// Unlike the other colour uniforms in this file the tint is not channel-swapped
// (the parameter name suggests it is already in BGRA order).
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
3143 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3146 int x, startx = span->startx, endx = span->endx;
3147 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes; the same 4 channels appear in both halves (one per pixel)
3148 tint = _mm_packs_epi32(tint, tint);
3149 for (x = startx;x+2 <= endx;x+=2)
// pix2 holds inb in the high byte of each lane (value*256) so that
// mulhi_epu16(tint, pix2) = value * tint
3151 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3152 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3153 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3154 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant
3158 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3159 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3160 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3161 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span helper: blends inb4ub over ina4ub using inb4ub's own alpha:
//   out4ub = ina + (inb - ina) * alpha(inb) / 256, per channel.
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
// `triangle` is not used by the visible code.
3167 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3170 int x, startx = span->startx, endx = span->endx;
3171 for (x = startx;x+2 <= endx;x+=2)
3173 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3174 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th channel (alpha) to all four of its lanes
3175 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply
// returns (diff * alpha) >> 8
3176 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3177 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant (only the low four lanes carry data)
3181 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3182 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3183 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3184 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3185 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span helper: blends a constant colour over in4ub using the colour's alpha:
//   out4ub = in + (color*255 - in) * (alpha*255) / 256, per channel.
// color points at 4 floats.
// Pixels are processed two at a time; the second group of statements is the
// single-pixel form (its guard is not visible in this listing).
// `triangle` is not used by the visible code.
3190 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3193 int x, startx = span->startx, endx = span->endx;
// scale to 0..255 integers and swap elements 0 and 2 (presumably RGBA -> BGRA)
3194 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3195 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the alpha lane to all lanes, pre-shifted by 4 for the mulhi below
3196 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3197 for (x = startx;x+2 <= endx;x+=2)
3199 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (alpha << 4) >> 16 == diff * alpha >> 8
3200 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3201 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant
3205 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3206 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3207 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms/projects POSITION by the
// ModelViewProjection matrix uniform, passes COLOR and TEXCOORD0 through, and
// additionally passes TEXCOORD1 through when SHADERPERMUTATION_SPECULAR is set.
3214 static void DPSOFTRAST_VertexShader_Generic(void)
3216 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3218 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3219 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3220 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader: builds buffer_FragColorbgra8 for the span
// and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// With SHADERPERMUTATION_DIFFUSE the first texture is sampled and modulated by
// the interpolated attribute array 1; with SHADERPERMUTATION_SPECULAR a second
// texture is sampled and combined by multiply (COLORMAPPING), add (GLOW) or
// alpha mix (VERTEXTEXTUREBLEND).
// NOTE(review): braces and the `else` that presumably guards the plain
// VaryingBGRA8 path are not visible in this listing.
// NOTE(review): the second texture is sampled with attribute array 2, the same
// index used for the first texture, although the vertex stage loads TEXCOORD1
// for SPECULAR — confirm this is intended.
3223 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3225 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3232 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3233 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3234 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3236 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3237 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3240 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// this branch used to re-test SHADERPERMUTATION_COLORMAPPING, which made the
// additive combine unreachable; GLOW is the permutation that selects it
3242 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3245 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3247 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3250 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3255 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3256 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects POSITION by the
// ModelViewProjection matrix uniform and loads two texcoord arrays.
// Note the second load pairs TEXCOORD1 with TEXCOORD4 (different array ids for
// the two arguments), unlike the first which uses TEXCOORD0 for both.
3261 static void DPSOFTRAST_VertexShader_PostProcess(void)
3263 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3264 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3265 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader: samples the first texture into
// buffer_FragColorbgra8, optionally adds thresholded bloom from the second
// texture (SHADERPERMUTATION_BLOOM), blends in the ViewTintColor uniform, and
// finishes the span. The SATURATION and GAMMARAMPS permutations are recognised
// but have no implementation (see the TODOs below).
3268 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3270 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3271 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3272 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3275 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3276 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture uses attribute array 3; BloomColorSubtract is the threshold colour
3278 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3279 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3281 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3282 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3284 // TODO: implement saturation
3286 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3288 // TODO: implement gammaramps
3290 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms/projects POSITION by
// the ModelViewProjection matrix uniform; no other attribute arrays are loaded.
3295 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3297 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: clears the span's pixels in a local
// BGRA8 buffer to zero and passes the buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
3300 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3302 // this is never called (because colormask is off when this shader is used)
3303 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3304 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3305 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero (endx - startx) pixels of 4 bytes each, starting at pixel startx
3306 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3307 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-colour shader: transforms/projects POSITION by the
// ModelViewProjection matrix uniform and transforms TEXCOORD0 by the TexMatrix
// uniform.
3312 static void DPSOFTRAST_VertexShader_FlatColor(void)
3314 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3315 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader: output = colour texture * Color_Ambient
// (with the alpha lane taken from the Alpha uniform instead).
// Results are written straight into the framebuffer row unless alpha-kill is
// enabled or the blend mode is not opaque, in which case they go to a local
// buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the `else`, loop-counter adjustments and some
// declarations are not visible in this listing.
3318 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3321 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the span's origin pixel in colour buffer 0 (4 bytes per pixel)
3322 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3323 int x, startx = span->startx, endx = span->endx;
3324 __m128i Color_Ambientm;
3325 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3326 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3327 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3328 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3329 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the scratch buffer when the finish pass is needed
3330 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3331 pixel = buffer_FragColorbgra8;
// ambient colour as x256 fixed point with elements 0 and 2 swapped
// (presumably RGBA -> BGRA); the 4th lane is then replaced by Alpha*255
3332 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3333 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3334 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3335 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3336 for (x = startx;x < endx;x++)
// fast path: four consecutive pixels whose mask bytes are all 1
3339 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3342 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3343 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3344 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3345 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3351 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3352 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3353 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when output was redirected to the scratch buffer above
3355 if (pixel == buffer_FragColorbgra8)
3356 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-colour shader: transforms/projects POSITION by the
// ModelViewProjection matrix uniform, passes COLOR through, and transforms
// TEXCOORD0 by the TexMatrix uniform.
3362 static void DPSOFTRAST_VertexShader_VertexColor(void)
3364 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3365 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3366 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader:
//   output = colour texture * (Color_Diffuse * vertex colour + Color_Ambient)
// where the vertex colour is the COLOR attribute interpolated per pixel and
// multiplied by buffer_z[x].
// Results are written straight into the framebuffer row unless alpha-kill is
// enabled or the blend mode is not opaque, in which case they go to a local
// buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, the `else`, loop-counter adjustments and the
// declarations of data/slope/mod2/pix2 are not visible in this listing.
3369 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3372 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the span's origin pixel in colour buffer 0 (4 bytes per pixel)
3373 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3374 int x, startx = span->startx, endx = span->endx;
3375 __m128i Color_Ambientm, Color_Diffusem;
3377 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3378 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3381 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3382 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the scratch buffer when the finish pass is needed
3383 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3384 pixel = buffer_FragColorbgra8;
// ambient colour as x256 fixed point with elements 0 and 2 swapped
// (presumably RGBA -> BGRA); the 4th lane is then replaced by Alpha*255
3385 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3388 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as x4096 fixed point, same channel swap, 4th lane cleared
3389 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3390 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3391 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3392 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3393 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3394 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// advance the attribute to startx and pre-scale value and step by 4096
3395 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3396 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3397 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by one pixel in the loop increment
3398 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3400 __m128i color, mod, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3401 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3404 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3405 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel modulators for pixels x..x+3; data is stepped three times here
// (the fourth step comes from the loop increment)
3406 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3407 data = _mm_add_ps(data, slope);
3408 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3409 data = _mm_add_ps(data, slope);
3410 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3411 data = _mm_add_ps(data, slope);
3412 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3413 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3414 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3415 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3416 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3417 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3423 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3424 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3425 mod = _mm_packs_epi32(mod, mod);
3426 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3427 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when output was redirected to the scratch buffer above
3429 if (pixel == buffer_FragColorbgra8)
3430 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects POSITION by the
// ModelViewProjection matrix uniform, transforms TEXCOORD0 by the TexMatrix
// uniform, and passes TEXCOORD4 through unchanged.
3436 static void DPSOFTRAST_VertexShader_Lightmap(void)
3438 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3439 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3440 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   output = colour * (Color_Diffuse * lightmap + Color_Ambient)
// and, with SHADERPERMUTATION_GLOW, additionally + Color_Glow * glow texture.
// The colour and glow textures are sampled with TEXCOORD0, the lightmap with
// TEXCOORD4. Results are written straight into the framebuffer row unless
// alpha-kill is enabled or the blend mode is not opaque, in which case they go
// to a local buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, `else` lines, loop-counter adjustments and the
// declaration of pix2 are not visible in this listing; the second x-loop is
// presumably the non-glow branch.
3443 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3446 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the span's origin pixel in colour buffer 0 (4 bytes per pixel)
3447 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3448 int x, startx = span->startx, endx = span->endx;
3449 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3450 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3451 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3452 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3454 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3456 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3457 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the scratch buffer when the finish pass is needed
3458 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3459 pixel = buffer_FragColorbgra8;
// ambient colour as x256 fixed point with elements 0 and 2 swapped
// (presumably RGBA -> BGRA); the 4th lane is then replaced by Alpha*255
3460 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3461 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3462 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3463 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as x256 fixed point, same channel swap, 4th lane cleared
3464 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3465 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3466 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3467 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3469 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow colour as x256 fixed point, same channel swap, 4th lane cleared
3470 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3471 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path
3473 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3474 for (x = startx;x < endx;x++)
3476 __m128i color, lightmap, glow, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3477 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3480 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3481 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3482 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3483 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3484 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3485 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3486 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3487 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3488 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3489 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes (diffuse*lightmap + ambient)*colour,
// the high half (where lightmap lanes are zero) computes glowcolour*glow; the
// two halves are then summed
3495 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3496 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3497 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3498 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3499 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3500 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3505 for (x = startx;x < endx;x++)
3507 __m128i color, lightmap, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3508 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3511 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3512 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3513 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3514 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3515 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3516 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3517 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3523 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3524 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3525 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3526 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when output was redirected to the scratch buffer above
3529 if (pixel == buffer_FragColorbgra8)
3530 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrappers that follow
// call these shared shaders before their definitions appear further down.
3535 void DPSOFTRAST_VertexShader_LightDirection(void);
3536 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage for the fake-light mode: delegates entirely to the shared
// LightDirection vertex shader.
3538 static void DPSOFTRAST_VertexShader_FakeLight(void)
3540 DPSOFTRAST_VertexShader_LightDirection();
// Pixel stage for the fake-light mode: forwards thread, triangle and span
// unchanged to the shared LightDirection pixel shader, which itself tests
// thread->shader_mode against SHADERMODE_FAKELIGHT.
3543 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3545 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for model-space light direction maps: runs the shared
// LightDirection vertex shader and additionally loads TEXCOORD4.
3550 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3552 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set the LightDirection pixel shader passes when
// sampling GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3553 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for model-space light direction maps: forwards its arguments
// unchanged to the shared LightDirection pixel shader, which itself tests
// thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3556 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3558 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for tangent-space light direction maps: runs the shared
// LightDirection vertex shader and additionally loads TEXCOORD4.
3563 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3565 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set the LightDirection pixel shader passes when
// sampling GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3566 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for tangent-space light direction maps: forwards its arguments
// unchanged to the shared LightDirection pixel shader, which itself tests
// thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3569 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3571 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the LightDirection family of modes.
// For every vertex it writes two extra attributes:
//   TEXCOORD5 = LightDir uniform dotted with the TEXCOORD1/2/3 vectors
//   TEXCOORD6 = (EyePosition - position) dotted with the same three vectors
// and finally projects the positions with the ModelViewProjection matrix.
3576 void DPSOFTRAST_VertexShader_LightDirection(void)
3579 int numvertices = dpsoftrast.numvertices;
3581 float LightVector[4];
3582 float EyePosition[4];
3583 float EyeVectorModelSpace[4];
// Copy the two uniforms used in the per-vertex loop into locals.
3589 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3590 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3591 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3592 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3593 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3594 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3595 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3596 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input arrays into post_array4f; TEXCOORD0 is additionally run
// through the TexMatrix uniform.
3597 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3598 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3599 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3600 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3601 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3602 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and the three vectors stored in TEXCOORD1
// (svector), TEXCOORD2 (tvector) and TEXCOORD3 (normal).
3604 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3605 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3606 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3607 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3608 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3609 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3610 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3611 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3612 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3613 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3614 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3615 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Express LightDir in the (svector, tvector, normal) basis via three dot
// products and store it as TEXCOORD5 with w = 0.
3616 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3617 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3618 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3619 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3620 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3621 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3622 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Same projection for the vertex-to-eye vector, stored as TEXCOORD6 with w = 0.
// The vector is not normalized here; the pixel shader normalizes per pixel.
3623 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3624 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3625 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3626 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3627 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3628 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3629 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3630 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3631 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3632 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Project the positions last, after the loop has read them unprojected.
// NOTE(review): the meaning of the -1 argument is not visible here - confirm
// against DPSOFTRAST_Array_TransformProject.
3634 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar min/max. Each argument is expanded more than once, so do not pass
// expressions with side effects.
3637 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3638 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component helpers operating on anything indexable with [0..2].
// Vector3Dot also expands its arguments several times.
3639 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3640 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3641 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Normalization helper; it starts by computing the vector's length.
// NOTE(review): the rest of the macro body is not visible here - confirm how a
// zero-length vector is handled.
3642 #define DPSOFTRAST_Vector3Normalize(v)\
3645 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, LightDirectionMap (model space and
// tangent space) and FakeLight modes.
//
// Shades the pixels startx..endx-1 of one span into buffer_FragColorbgra8 and
// hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.  One of three per-pixel
// loops does the work, selected by thread->shader_permutation:
//   SHADERPERMUTATION_SPECULAR : ambient + diffuse + specular (+ optional glow)
//   SHADERPERMUTATION_DIFFUSE  : ambient + diffuse (+ optional glow)
//   last loop                  : ambient only (+ optional glow); presumably the
//                                fallback when neither bit is set
// Inside the lit loops, thread->shader_mode selects where the light direction
// and light color come from.
3656 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: four bytes per pixel, indexed as x*4 + channel.
3658 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3659 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668 int x, startx = span->startx, endx = span->endx;
3669 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolated attributes come as (data, slope) pairs; the loops below evaluate
// them as data + slope * x.
3670 float LightVectordata[4];
3671 float LightVectorslope[4];
3672 float EyeVectordata[4];
3673 float EyeVectorslope[4];
3674 float VectorSdata[4];
3675 float VectorSslope[4];
3676 float VectorTdata[4];
3677 float VectorTslope[4];
3678 float VectorRdata[4];
3679 float VectorRslope[4];
3681 float diffusetex[4];
3683 float surfacenormal[4];
3684 float lightnormal[4];
3685 float lightnormal_modelspace[4];
3687 float specularnormal[4];
3690 float SpecularPower;
// Color uniforms: components 0,1,2 are stored at indices 2,1,0 so that index c
// pairs with byte c of the *bgra8 buffers (presumably RGB uniforms against BGRA
// texels).  Index 3 multiplies the alpha byte: 0 for glow/pants/shirt, the
// Alpha uniform for Color_Ambient.
3692 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3693 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3694 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3695 Color_Glow[3] = 0.0f;
3696 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3697 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3698 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3699 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3700 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3701 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3702 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3703 Color_Pants[3] = 0.0f;
3704 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3705 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3706 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3707 Color_Shirt[3] = 0.0f;
// Texture fetches needed by every path: the color map always, pants/shirt and
// glow only when the corresponding permutation bit is set.
3708 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3709 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3710 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3712 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3713 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3715 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3717 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: ambient + diffuse + specular --------------------------------
3719 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3721 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3722 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3723 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3724 Color_Diffuse[3] = 0.0f;
3725 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3726 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3727 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3728 LightColor[3] = 0.0f;
3729 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3730 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3731 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3732 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3733 Color_Specular[3] = 0.0f;
// Pre-divide by 255 because the exponent below multiplies SpecularPower by the
// raw gloss alpha byte (0..255).
3734 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed in every mode on this path because the
// specular term always uses it.
3735 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3736 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: the map modes sample lightmap + deluxemap through
// TEXCOORD4 (model space also needs TEXCOORD1..3 as VectorS/T/R); fake light
// needs nothing extra; the remaining case interpolates TEXCOORD5.
3738 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3740 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3741 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3742 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3743 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3744 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3746 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3748 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3749 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3751 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3753 // nothing of this needed
3757 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3760 for (x = startx;x < endx;x++)
// Diffuse texel as floats in byte units (0..255), optionally tinted by the
// pants and shirt layers.
3763 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3764 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3765 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3766 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3767 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3769 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3770 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3771 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3772 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3774 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3775 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3776 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3777 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 maps 0..255 to about -1..+0.992; bytes
// 2,1,0 become x,y,z.
3778 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3779 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3780 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3781 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the direction, then rotate it into the basis
// formed by the interpolated VectorS/T/R attributes.
3783 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3785 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3786 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3787 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3788 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3790 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3791 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3792 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3793 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3795 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3796 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3797 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3798 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3800 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3801 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3802 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3803 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3805 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3806 DPSOFTRAST_Vector3Normalize(lightnormal);
3808 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by the light normal's z
// component, with that divisor floored at 0.25.
3810 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3811 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3812 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3813 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded direction is used as-is (it is not
// normalized here) and the lightmap is only scaled by 1/256.
3816 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3818 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3819 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3820 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3822 float f = 1.0f / 256.0f;
3823 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3824 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3825 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the light direction is the normalized eye vector and the light
// color is white.
// NOTE(review): z is not assigned in the lines visible here; presumably it is
// taken from buffer_z[x] at the top of the loop - confirm.
3828 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3830 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3831 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3832 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3833 DPSOFTRAST_Vector3Normalize(lightnormal);
3835 LightColor[0] = 1.0;
3836 LightColor[1] = 1.0;
3837 LightColor[2] = 1.0;
// Remaining mode: light direction from the interpolated TEXCOORD5 attribute;
// LightColor keeps the uniform value loaded before the loop.
3841 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3842 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3843 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3844 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L with negative values cut off at zero.
3847 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3849 if(thread->shader_exactspecularmath)
3851 // reflect lightnormal at surfacenormal, take the negative of that
3852 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3854 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3855 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3856 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3857 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3859 // dot of this and normalize(EyeVectorFogDepth.xyz)
3860 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3861 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3862 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3863 DPSOFTRAST_Vector3Normalize(eyenormal);
3865 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Half-vector form (presumably the non-exact branch): normalize(L + E) dotted
// with the surface normal, negative values cut off at zero.
3869 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3870 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3871 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3872 DPSOFTRAST_Vector3Normalize(eyenormal);
3874 specularnormal[0] = lightnormal[0] + eyenormal[0];
3875 specularnormal[1] = lightnormal[1] + eyenormal[1];
3876 specularnormal[2] = lightnormal[2] + eyenormal[2];
3877 DPSOFTRAST_Vector3Normalize(specularnormal);
3879 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is 1 + SpecularPower * gloss alpha, with SpecularPower already
// divided by 255 above.
3881 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine per channel:
//   [glow * Color_Glow +] tex * ambient + (tex * diffuse + gloss * specular) * light
// Alpha is tex alpha * Color_Ambient[3].  Only the upper bound (255) is clamped.
3883 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3885 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3886 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3887 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3888 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3892 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3893 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3894 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3895 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3898 buffer_FragColorbgra8[x*4+0] = d[0];
3899 buffer_FragColorbgra8[x*4+1] = d[1];
3900 buffer_FragColorbgra8[x*4+2] = d[2];
3901 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: ambient + diffuse (no gloss texture, no specular term) ------
3904 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3906 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3907 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3908 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3909 Color_Diffuse[3] = 0.0f;
3910 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3911 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3912 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3913 LightColor[3] = 0.0f;
3914 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Same mode-specific inputs as the specular path, except that the eye vector
// is only interpolated for fake light, the one mode here that reads it.
3916 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3918 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3919 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3920 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3921 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3922 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3924 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3927 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3929 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3931 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3935 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3938 for (x = startx;x < endx;x++)
// NOTE(review): unlike the specular path, the pants/shirt layers are not added
// to diffusetex in the lines visible here - confirm this is intended.
3941 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode the normal map exactly as in the specular path.
3945 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3946 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3947 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3948 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Light direction and color per mode, same computations as the specular path.
3950 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3952 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3953 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3954 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3955 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3957 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3958 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3959 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3960 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3962 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3963 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3964 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3965 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3967 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3968 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3969 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3970 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3972 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3973 DPSOFTRAST_Vector3Normalize(lightnormal);
3975 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3977 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3978 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3979 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3980 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3983 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3985 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3986 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3987 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3989 float f = 1.0f / 256.0f;
3990 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3991 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3992 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3995 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3997 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3998 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3999 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4000 DPSOFTRAST_Vector3Normalize(lightnormal);
4002 LightColor[0] = 1.0;
4003 LightColor[1] = 1.0;
4004 LightColor[2] = 1.0;
4008 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4009 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4010 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4011 DPSOFTRAST_Vector3Normalize(lightnormal);
// Combine per channel: [glow * Color_Glow +] tex * (ambient + diffuse * N.L * light).
// Only the upper bound (255) is clamped.
4014 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4015 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4017 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4018 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4019 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4020 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4024 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4025 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4026 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4027 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4029 buffer_FragColorbgra8[x*4+0] = d[0];
4030 buffer_FragColorbgra8[x*4+1] = d[1];
4031 buffer_FragColorbgra8[x*4+2] = d[2];
4032 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only: tex * ambient, plus glow when enabled ---------
4037 for (x = startx;x < endx;x++)
4040 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4041 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4042 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4043 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4045 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4047 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4048 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4049 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4050 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4054 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4055 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4056 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4057 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4059 buffer_FragColorbgra8[x*4+0] = d[0];
4060 buffer_FragColorbgra8[x*4+1] = d[1];
4061 buffer_FragColorbgra8[x*4+2] = d[2];
4062 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
4065 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source mode.
// For every vertex it replaces TEXCOORD1 with (LightPosition - position) and
// TEXCOORD2 with (EyePosition - position), both expressed in the basis given by
// the incoming TEXCOORD1/2/3 vectors, then projects the positions and fills
// TEXCOORD3 through the ModelToLight matrix.
4070 static void DPSOFTRAST_VertexShader_LightSource(void)
4073 int numvertices = dpsoftrast.numvertices;
4074 float LightPosition[4];
4075 float LightVector[4];
4076 float LightVectorModelSpace[4];
4077 float EyePosition[4];
4078 float EyeVectorModelSpace[4];
// Copy the two uniforms used in the per-vertex loop into locals.
4084 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4085 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4086 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4087 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4088 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4089 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4090 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4091 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input arrays into post_array4f; TEXCOORD0 is additionally run
// through the TexMatrix uniform.
4092 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4093 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4094 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4095 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4096 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4097 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4098 for (i = 0;i < numvertices;i++)
// Read position, svector, tvector and normal into locals first: the TEXCOORD1
// and TEXCOORD2 slots they come from are overwritten further down for the
// same vertex.
4100 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4101 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4102 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4103 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4104 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4105 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4106 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4107 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4108 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4109 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4110 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4111 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vertex-to-light vector, projected onto (svector, tvector, normal) and stored
// as TEXCOORD1 with w = 0.
4112 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4113 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4114 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4115 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4116 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4117 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4118 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4119 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4120 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4121 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vertex-to-eye vector, projected the same way and stored as TEXCOORD2 with
// w = 0.
4122 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4123 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4124 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4125 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4126 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4127 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4128 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4129 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4130 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4131 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project the positions after the loop has read them unprojected.
// NOTE(review): the meaning of the -1 argument is not visible here - confirm
// against DPSOFTRAST_Array_TransformProject.
4133 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (which held the normal during the loop) is refilled from the
// position array through the ModelToLight matrix.
// NOTE(review): presumably the first argument is the destination and the second
// the source input array, as in the TEXCOORD0 call above - confirm.
4134 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the LightSource mode: shades one span of a triangle and
// hands the resulting BGRA8 bytes to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies the uniforms (uniform4f) and shader_permutation flags
//   triangle - interpolation data consumed by DPSOFTRAST_CALCATTRIB4F
//   span     - pixel range [startx, endx) to shade
// Three per-pixel loops follow: one under SHADERPERMUTATION_SPECULAR, one
// under SHADERPERMUTATION_DIFFUSE, and a third ambient-only loop (presumably
// the fallback when neither flag is set - confirm).
// NOTE(review): brace-only lines and a few short statements/declarations
// (e.g. for z, attenuation, d, glosstex, eyenormal, diffuse, specular, f) are
// not visible in this copy; comments describe the visible statements only.
4137 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, indexed by pixel x (4 bytes per pixel for the
// *bgra8 buffers).
4140 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4141 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4142 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148 int x, startx = span->startx, endx = span->endx;
4149 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is kept as a (data, slope) pair and evaluated
// per pixel as (data + slope*x) * z.
4150 float CubeVectordata[4];
4151 float CubeVectorslope[4];
4152 float LightVectordata[4];
4153 float LightVectorslope[4];
4154 float EyeVectordata[4];
4155 float EyeVectorslope[4];
4157 float diffusetex[4];
4159 float surfacenormal[4];
4160 float lightnormal[4];
4162 float specularnormal[4];
4165 float SpecularPower;
4166 float CubeVector[4];
// Read the color uniforms. Components 0 and 2 are swapped on load so each
// array lines up with the channel order of the *bgra8 buffers it multiplies.
// Color_Glow is loaded but not referenced again in the visible code.
4169 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4170 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4171 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4172 Color_Glow[3] = 0.0f;
4173 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4174 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4175 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component comes from the separate Alpha uniform.
4176 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4177 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4178 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4179 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4180 Color_Diffuse[3] = 0.0f;
4181 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4182 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4183 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4184 Color_Specular[3] = 0.0f;
// Pants/shirt alpha factors are zero, so color mapping never changes
// diffusetex[3] below.
4185 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4186 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4187 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4188 Color_Pants[3] = 0.0f;
4189 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4190 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4191 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4192 Color_Shirt[3] = 0.0f;
4193 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4194 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4195 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4196 LightColor[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the raw gloss alpha
// byte (glosstex[3]) when forming the specular exponent.
4197 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube vector.
4198 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4199 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4200 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4201 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4202 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// The base color texture is always sampled, using TEXCOORD0.
4203 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Pants and shirt layers are sampled only when color mapping is enabled.
4204 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4206 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4207 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// The cube filter texture is looked up with the TEXCOORD3 attribute.
4209 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4210 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Specular path: needs the normal map and the gloss map as well. ----
4211 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215 for (x = startx;x < endx;x++)
// NOTE(review): z is presumably the per-pixel value from buffer_z; its
// assignment is not visible here - confirm.
4218 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4219 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4220 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// Attenuation is 1 minus the squared length of CubeVector, so it reaches zero
// once that length reaches 1.
4221 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not visible; given the
// memset comment above, such pixels are presumably skipped - confirm.
4222 if (attenuation < 0.01f)
// With shadow mapping, the shadow map sample further scales the attenuation.
4224 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4226 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4227 if (attenuation < 0.01f)
// Base color as floats, still in byte units.
4231 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4232 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4233 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4234 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Add the tinted pants and shirt layers on top of the base color.
4235 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4237 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4238 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4239 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4240 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4242 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4243 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4244 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4245 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1 per channel, reading the bytes in
// reverse order (byte 2 -> component 0), then normalize.
4246 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4247 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4248 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4249 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Interpolated light vector for this pixel, normalized.
4251 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4252 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4253 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4254 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surface normal, light direction), clamped at zero.
4256 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Exact specular: dot of the eye direction with the reflected light direction.
4258 if(thread->shader_exactspecularmath)
4260 // reflect lightnormal at surfacenormal, take the negative of that
4261 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4263 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4264 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4265 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4266 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4268 // dot of this and normalize(EyeVectorFogDepth.xyz)
4269 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4270 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4271 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4272 DPSOFTRAST_Vector3Normalize(eyenormal);
4274 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Half-vector variant (presumably the else branch of the test above - confirm):
// specularnormal = normalize(light + eye), then dot with the surface normal.
4278 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4279 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4280 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4281 DPSOFTRAST_Vector3Normalize(eyenormal);
4283 specularnormal[0] = lightnormal[0] + eyenormal[0];
4284 specularnormal[1] = lightnormal[1] + eyenormal[1];
4285 specularnormal[2] = lightnormal[2] + eyenormal[2];
4286 DPSOFTRAST_Vector3Normalize(specularnormal);
4288 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// The exponent is 1 + SpecularPower scaled by the gloss map's alpha byte.
4290 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
// Combine: (diffusetex * (ambient + diffuse) + gloss * specular) * light color
// * attenuation, optionally times the cube filter texel. Only the upper bound
// (255) is clamped; the output alpha is the diffuse texture alpha.
4292 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4294 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4295 attenuation *= (1.0f / 255.0f);
4296 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4297 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4298 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4299 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// Same combination without the cube filter factor.
4303 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4304 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4305 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4306 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4308 buffer_FragColorbgra8[x*4+0] = d[0];
4309 buffer_FragColorbgra8[x*4+1] = d[1];
4310 buffer_FragColorbgra8[x*4+2] = d[2];
4311 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Diffuse path: same as above without the gloss/specular term. ----
4314 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4316 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4317 for (x = startx;x < endx;x++)
4320 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4321 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4322 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// Same attenuation computation and low-attenuation tests as the specular loop.
4323 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4324 if (attenuation < 0.01f)
4326 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4328 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4329 if (attenuation < 0.01f)
4333 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4334 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4335 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4336 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4337 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4339 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4340 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4341 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4342 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// Decode and normalize the normal map texel (bytes read in reverse order).
4344 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4345 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4346 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4347 DPSOFTRAST_Vector3Normalize(surfacenormal);
4349 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4350 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4351 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4352 DPSOFTRAST_Vector3Normalize(lightnormal);
4354 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Combine: diffusetex * (ambient + diffuse) * light color * attenuation,
// optionally times the cube filter texel; clamp each channel to 255.
4355 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4357 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4358 attenuation *= (1.0f / 255.0f);
4359 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4360 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4361 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4362 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4366 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4367 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4368 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4369 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4371 buffer_FragColorbgra8[x*4+0] = d[0];
4372 buffer_FragColorbgra8[x*4+1] = d[1];
4373 buffer_FragColorbgra8[x*4+2] = d[2];
4374 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Ambient-only loop: no normal map, no diffuse or specular term. ----
4379 for (x = startx;x < endx;x++)
4382 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4383 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4384 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4385 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4386 if (attenuation < 0.01f)
4388 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4390 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4391 if (attenuation < 0.01f)
4395 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4396 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4397 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4398 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4399 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4401 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4402 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4403 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4404 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// Combine: diffusetex * ambient * light color * attenuation, optionally times
// the cube filter texel; clamp each channel to 255.
4406 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4408 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4409 attenuation *= (1.0f / 255.0f);
4410 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4411 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4412 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4413 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4417 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4418 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4419 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4420 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4422 buffer_FragColorbgra8[x*4+0] = d[0];
4423 buffer_FragColorbgra8[x*4+1] = d[1];
4424 buffer_FragColorbgra8[x*4+2] = d[2];
4425 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span (cleared bytes plus any pixels written above) on.
4428 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Refraction mode. Takes no arguments; it works on the
// global dpsoftrast state.
// NOTE(review): the helper arguments appear to be (destination array, source
// array, matrix) - confirm against the helpers' definitions.
4434 static void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD4 receives POSITION transformed by the model-view-projection matrix
// (via Array_Transform rather than Array_TransformProject); the pixel shader
// reads it back as ModelViewProjectionPosition.
4436 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is run through the texture matrix.
4437 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// POSITION itself is transformed and projected with the same MVP matrix.
4438 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the Refraction mode: for each pixel of the span, looks up
// the refraction texture at a screen coordinate perturbed by the normal map,
// tints it with RefractColor and writes BGRA8 output.
//   thread   - supplies uniforms and the bound textures (texbound)
//   triangle - interpolation data consumed by DPSOFTRAST_CALCATTRIB4F
//   span     - pixel range [startx, endx) to shade
// Returns without drawing anything when no refraction texture is bound.
// NOTE(review): declarations of iw, v and c are not visible in this copy.
4441 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4443 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4445 int x, startx = span->startx, endx = span->endx;
4448 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4449 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// (data, slope) pair for the TEXCOORD4 attribute, evaluated as data + slope*x.
4452 float ModelViewProjectionPositiondata[4];
4453 float ModelViewProjectionPositionslope[4];
4456 float ScreenScaleRefractReflect[2];
4457 float ScreenCenterRefractReflect[2];
4458 float DistortScaleRefractReflect[2];
4459 float RefractColor[4];
4461 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4462 if(!texture) return;
// Start the span and sample the normal map using TEXCOORD0.
4465 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4466 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4469 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
// Only the first two components of the screen/distort uniforms are used here.
4472 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4473 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4474 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4475 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4476 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4477 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// Components 0 and 2 are swapped on load so RefractColor lines up with the
// channel order of the sampled texel bytes it multiplies.
4478 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4479 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4480 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4481 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4484 for (x = startx;x < endx;x++)
4486 float SafeScreenTexCoord[2];
4487 float ScreenTexCoord[2];
4494 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// iw is the reciprocal of the interpolated fourth (w) component.
4495 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4497 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4498 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4499 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted GLSL line below multiplies by
// DistortScaleRefractReflect.zw, whereas this code uses components [0] and
// [1] - confirm which is intended.
4501 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// Decode the normal map texel (byte/128 - 1, bytes read in reverse order),
// normalize it, and use its first two components as the distortion offset.
4502 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4503 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4504 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4505 DPSOFTRAST_Vector3Normalize(v);
4506 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4507 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4509 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4510 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// The color channels are stored without clamping; only the alpha is limited
// to 255. NOTE(review): this presumably relies on RefractColor components
// staying within [0, 1] - confirm.
4512 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4513 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4514 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4515 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4518 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Water mode. Takes no arguments; it works on the
// global dpsoftrast state. For every vertex it computes the vector from the
// vertex to the eye, expressed in the basis given by the TEXCOORD1/2/3
// attributes, and stores it in TEXCOORD6 for the pixel shader.
// NOTE(review): declarations of i, position, svector, tvector, normal and
// EyeVector are not visible in this copy.
4523 static void DPSOFTRAST_VertexShader_Water(void)
4526 int numvertices = dpsoftrast.numvertices;
4527 float EyePosition[4];
4528 float EyeVectorModelSpace[4];
4534 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4535 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4536 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4537 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Load the attributes the loop reads back from post_array4f.
4538 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4539 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4540 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4541 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4542 for (i = 0;i < numvertices;i++)
// Only the first three components of each 4-float attribute are read.
4544 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4545 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4546 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4547 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4548 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4549 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4550 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4551 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4552 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4553 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4554 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4555 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the eye position.
4556 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4557 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4558 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// Project it onto svector, tvector and normal (three dot products).
4559 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4560 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4561 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Store the result in TEXCOORD6 with a zero fourth component.
4562 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4563 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4564 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4565 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the -1 source argument presumably tells the helper to reuse
// the POSITION data loaded above - confirm against its definition.
4567 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD4 receives POSITION transformed by the MVP matrix; the pixel shader
// reads it back as ModelViewProjectionPosition.
4568 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is run through the texture matrix.
4569 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the Water mode: for each pixel of the span, samples the
// refraction and reflection textures at screen coordinates perturbed by the
// normal map and blends the two tinted samples by a Fresnel factor derived
// from the interpolated eye vector.
//   thread   - supplies uniforms and the bound textures (texbound)
//   triangle - interpolation data consumed by DPSOFTRAST_CALCATTRIB4F
//   span     - pixel range [startx, endx) to shade
// Returns without drawing anything unless both textures are bound.
// NOTE(review): declarations of iw, v and Fresnel are not visible in this copy.
4573 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4575 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4577 int x, startx = span->startx, endx = span->endx;
4580 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4581 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// (data, slope) pairs for the TEXCOORD4 and TEXCOORD6 attributes, evaluated
// per pixel as data + slope*x.
4584 float ModelViewProjectionPositiondata[4];
4585 float ModelViewProjectionPositionslope[4];
4586 float EyeVectordata[4];
4587 float EyeVectorslope[4];
// Components [0],[1] of these uniforms feed the refraction lookup and
// [2],[3] the reflection lookup (see ScreenTexCoord below).
4590 float ScreenScaleRefractReflect[4];
4591 float ScreenCenterRefractReflect[4];
4592 float DistortScaleRefractReflect[4];
4593 float RefractColor[4];
4594 float ReflectColor[4];
4595 float ReflectFactor;
4596 float ReflectOffset;
4598 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4599 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4600 if(!texture_refraction || !texture_reflection) return;
// Start the span and sample the normal map using TEXCOORD0.
4603 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4604 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4607 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4608 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4611 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4612 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4613 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4614 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4615 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4616 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4617 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4618 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4619 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4620 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4621 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4622 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// Color uniforms are loaded with components 0 and 2 swapped so they line up
// with the channel order of the sampled texel bytes they multiply.
4623 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4624 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4625 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4626 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4627 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4628 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4629 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4630 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4631 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4632 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4635 for (x = startx;x < endx;x++)
4637 float SafeScreenTexCoord[4];
4638 float ScreenTexCoord[4];
// c1 receives the refraction sample, c2 the reflection sample.
4641 unsigned char c1[4];
4642 unsigned char c2[4];
4647 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// iw is the reciprocal of the interpolated fourth (w) component.
4648 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4650 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4651 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4652 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4653 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4654 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4656 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// Decode the normal map texel (byte/128 - 1, bytes read in reverse order),
// normalize it, and apply its first two components as distortion to both
// coordinate pairs.
4657 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4658 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4659 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4660 DPSOFTRAST_Vector3Normalize(v);
4661 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4662 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4663 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4664 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4666 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector. It is not multiplied by z because it is
// normalized immediately afterwards, which cancels any uniform scale.
4667 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4668 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4669 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4670 DPSOFTRAST_Vector3Normalize(v);
// Fresnel = min(1, 1 - v.z)^2 * ReflectFactor + ReflectOffset.
4671 Fresnel = 1.0f - v[2];
4672 Fresnel = min(1.0f, Fresnel);
4673 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4675 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4676 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4677 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4678 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// Linear blend of the tinted refraction and reflection samples by Fresnel.
// The color channels are stored without clamping; only the alpha is limited
// to 255.
4680 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4681 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4682 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4683 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4686 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// The only work visible here is one DPSOFTRAST_Array_TransformProject call that
// names DPSOFTRAST_ARRAY_POSITION for both of its array arguments and passes the
// uniform4f data starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4691 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4693 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// As written it does not visualise depth: it zeroes the BGRA8 fragment bytes
// for the span's [startx, endx) range and hands them to the span finisher.
//   thread   - worker thread state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the pixels that are written
4696 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to DPSOFTRAST_Draw_Span_Begin; nothing here reads it back
4699 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
// four bytes per pixel, indexed by x*4 like the other pixel shaders in this file
4700 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4701 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// all four channels (alpha included) become 0 for every pixel of the span
4702 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4703 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Identical to the ShowDepth vertex stage: a single
// DPSOFTRAST_Array_TransformProject call over DPSOFTRAST_ARRAY_POSITION using the
// uniform4f data starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4708 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4710 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// No geometry-buffer output is produced here: the span's fragment bytes are
// zeroed and passed to the span finisher, exactly as the ShowDepth span stage does.
//   thread   - worker thread state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the pixels that are written
4713 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to DPSOFTRAST_Draw_Span_Begin; nothing here reads it back
4716 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4717 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4718 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, all set to 0 over [startx, endx)
4719 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4720 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Identical to the ShowDepth vertex stage: a single
// DPSOFTRAST_Array_TransformProject call over DPSOFTRAST_ARRAY_POSITION using the
// uniform4f data starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4725 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4727 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// No lighting is evaluated here: the span's fragment bytes are zeroed and
// passed to the span finisher, exactly as the ShowDepth span stage does.
//   thread   - worker thread state, forwarded to the span helpers
//   triangle - triangle the span belongs to, forwarded to the span helpers
//   span     - span being shaded; startx/endx select the pixels that are written
4730 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is only passed to DPSOFTRAST_Draw_Span_Begin; nothing here reads it back
4733 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4734 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4735 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, all set to 0 over [startx, endx)
4736 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4737 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span entry points plus the lists of
// vertex arrays and texture units it uses. One record per mode is stored in
// DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table initializers start with an integer and
// DPSOFTRAST_Interpret_Draw reads a lodarrayindex member, which is not among the
// members listed below - confirm against the full declaration.
4742 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; takes no arguments, called from DPSOFTRAST_DrawTriangles
4745 void (*Vertex)(void);
// span (pixel) stage; called once per surviving span from DPSOFTRAST_Draw_ProcessSpans
4746 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices used by the mode; readers test each entry against DPSOFTRAST_ARRAY_TOTAL
4747 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; readers test each entry against DPSOFTRAST_MAXTEXTUREUNITS
4748 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4750 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, looked up as DPSOFTRAST_ShaderModeTable[shader_mode].
// Each row is {lodarrayindex, Vertex, Span, arrays, texunits}:
//   lodarrayindex - array index that DPSOFTRAST_Interpret_Draw compares against when
//                   choosing which attribute feeds the mip level estimate
//   Vertex / Span - vertex stage and per-span pixel stage for the mode
//   arrays        - vertex arrays interpolated for the mode, ended by ~0
//   texunits      - texture units the mode may sample, ended by ~0
// ~0 stored in an unsigned char is 255; readers treat any entry that is
// >= DPSOFTRAST_ARRAY_TOTAL / >= DPSOFTRAST_MAXTEXTUREUNITS as the end marker.
// Every row spells out both end markers. An omitted texunits list would be
// zero-filled by the compiler, and 0 passes the bounds test as texture unit 0, so
// modes that sample nothing would still get mip selection work done for unit 0.
4752 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4754 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4755 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4756 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4757 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4758 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4759 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4760 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4761 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4762 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4763 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4764 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4765 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4766 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4767 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4768 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
// the three placeholder modes below interpolate no arrays and sample no textures
4769 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4770 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4771 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}},
// Builds the per-pixel pass/fail mask for one span and stores it on the span.
//   thread - supplies the depth test state (depthtest, fb_depthfunc) and the
//            scratch mask buffer (pixelmaskarray)
//   span   - in: startx, depthbase, depthslope, x, y, triangle;
//            out: pixelmask and startx are written back at the end
// With depth testing active each pixel's stored depth is compared with the span's
// linearly stepped depth value; otherwise every pixel in the range is marked as passing.
4774 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4779 unsigned int *depthpixel;
4783 unsigned char *pixelmask;
4784 DPSOFTRAST_State_Triangle *triangle;
4785 triangle = &thread->triangles[span->triangle];
// depthpixel points at the span's origin (span->x, span->y) in the depth buffer,
// so the x values used below are offsets relative to span->x
4786 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4787 startx = span->startx;
4789 depth = span->depthbase;
4790 depthslope = span->depthslope;
4791 pixelmask = thread->pixelmaskarray;
4792 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4794 switch(thread->fb_depthfunc)
// in every case d starts at depth + depthslope*startx and advances by depthslope per pixel;
// the stored value is the left operand, e.g. GL_LESS passes where stored depth < d
4797 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4798 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4799 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4800 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4801 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4802 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4803 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// scan inward over failed pixels at the left end, then at the right end of the range
4805 while (startx < endx && !pixelmask[startx])
4807 while (endx > startx && !pixelmask[endx-1])
4812 // no depth testing means we're just dealing with color...
4813 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the range start for the pixel shader and depth write stages
4815 span->pixelmask = pixelmask;
4816 span->startx = startx;
// Depth buffer update stage for one span.
//   thread - supplies depthmask and depthtest
//   span   - supplies depthbase, depthslope, pixelmask, startx and the span origin
// Nothing is done unless depth writes are enabled (depthmask), depth testing is
// enabled, and a depth buffer is bound.
4820 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4822 int x, d, depth, depthslope, startx, endx;
4823 const unsigned char *pixelmask;
4824 unsigned int *depthpixel;
4825 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4827 depth = span->depthbase;
4828 depthslope = span->depthslope;
// mask produced by DPSOFTRAST_Draw_DepthTest (the pixel shader runs in between, see ProcessSpans)
4829 pixelmask = span->pixelmask;
4830 startx = span->startx;
// same addressing as the depth test: pointer to the span origin, x is an offset from it
4832 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// d is stepped across the span exactly as in the depth test loops
4833 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Runs every span queued on this thread through the pipeline and empties the queue.
// Per span: depth test, then the shader mode's Span function (only when a colour
// buffer is bound and the colour mask is non-zero), then the depth write.
//   thread - worker thread whose spans[0 .. numspans) are consumed; numspans is reset to 0
4839 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4842 DPSOFTRAST_State_Triangle *triangle;
4843 DPSOFTRAST_State_Span *span;
4844 for (i = 0; i < thread->numspans; i++)
4846 span = &thread->spans[i];
// each span records the index of its triangle in thread->triangles
4847 triangle = &thread->triangles[span->triangle];
4848 DPSOFTRAST_Draw_DepthTest(thread, span);
// checks for a span whose range came back empty from the depth test
4849 if (span->startx >= span->endx)
4851 // run pixel shader if appropriate
4852 // do this before running depthmask code, to allow the pixelshader
4853 // to clear pixelmask values for alpha testing
4854 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4855 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4856 DPSOFTRAST_Draw_DepthWrite(thread, span);
4858 thread->numspans = 0;
// Command record for the Draw opcode (22): the Y range touched by the draw, a
// reference count released once by every thread, the near-clip flag, the vertex
// and triangle counts, the vertex data block (arrays) and one of two index lists.
4861 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
// Per-thread execution of a Draw command: sets up every triangle, cuts it into
// horizontal spans restricted to this thread's two Y bands and the scissor
// rectangle, and queues those spans for DPSOFTRAST_Draw_ProcessSpans.
//   thread  - worker state (scissor, bands, viewport, clip plane, span/triangle queues)
//   command - the draw to execute; its refcount is decremented exactly once on
//             whichever exit path is taken
4863 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4866 int cullface = thread->cullface;
4867 int minx, maxx, miny, maxy;
4868 int miny1, maxy1, miny2, maxy2;
4869 __m128i fbmin, fbmax;
4870 __m128 viewportcenter, viewportscale;
4871 int firstvertex = command->firstvertex;
4872 int numvertices = command->numvertices;
4873 int numtriangles = command->numtriangles;
4874 const int *element3i = command->element3i;
4875 const unsigned short *element3s = command->element3s;
4876 int clipped = command->clipped;
4883 int starty, endy, bandy;
4887 float clip0origin, clip0slope;
4889 __m128 triangleedge1, triangleedge2, trianglenormal;
4892 DPSOFTRAST_State_Triangle *triangle;
4893 DPSOFTRAST_Texture *texture;
4894 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range; the thread's two bands are clamped into it
4895 miny = thread->fb_scissor[1];
4896 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4897 miny1 = bound(miny, thread->miny1, maxy);
4898 maxy1 = bound(miny, thread->maxy1, maxy);
4899 miny2 = bound(miny, thread->miny2, maxy);
4900 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw's Y range misses both bands: only the reference is released
4901 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4903 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare struct has its vertex data in a separate
// allocation (see DPSOFTRAST_Draw_AllocateDrawCommand), freed by the last user
4905 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4906 MM_FREE(command->arrays);
4910 minx = thread->fb_scissor[0];
4911 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// 16-bit (x, y) pairs repeated four times; fbmax has 1 subtracted so it is an inclusive bound
4912 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4913 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4914 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4915 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4916 screen[3] = _mm_setzero_ps();
4917 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4918 for (i = 0;i < numtriangles;i++)
// data block layout: screen-space coordinates first, then numvertices*4 floats
// later the first per-vertex array
4920 const float *screencoord4f = command->arrays;
4921 const float *arrays = screencoord4f + numvertices*4;
4923 // generate the 3 edges of this triangle
4924 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices come from the 16-bit or the 32-bit list and are rebased by firstvertex
4927 e[0] = element3s[i*3+0] - firstvertex;
4928 e[1] = element3s[i*3+1] - firstvertex;
4929 e[2] = element3s[i*3+2] - firstvertex;
4933 e[0] = element3i[i*3+0] - firstvertex;
4934 e[1] = element3i[i*3+1] - firstvertex;
4935 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros for the triangle loop:
//   SKIPBACKFACE      - builds the two screen-space edges and the scalar normal term, then tests its sign
//   CLIPPEDVERTEXLERP - stores the crossing fraction in clipfrac[p1] and projects the interpolated vertex into screen[k]
//   CLIPPEDVERTEXCOPY - loads an already projected vertex into screen[k]
//   GENATTRIB*        - fetch or interpolate a per-vertex attribute with the same clipfrac values
4944 #define SKIPBACKFACE \
4945 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4946 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4947 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4948 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4949 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4953 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4957 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4962 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4963 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4965 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4966 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4968 #define CLIPPEDVERTEXCOPY(k,p1) \
4969 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4971 #define GENATTRIBCOPY(attrib, p1) \
4972 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4973 #define GENATTRIBLERP(attrib, p1, p2) \
4975 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4976 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4978 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4982 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4983 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4985 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4986 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4987 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4988 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4994 // calculate distance from nearplane
// z + w of each vertex read from the first per-vertex array; >= 0 counts as in front
4995 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4996 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4997 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the nested tests below enumerate which of the three vertices are in front;
// a single clipped vertex yields a four-point polygon (screen[0..3]), two yield a triangle
4998 if (clipdist[0] >= 0.0f)
5000 if (clipdist[1] >= 0.0f)
5002 if (clipdist[2] >= 0.0f)
5005 // triangle is entirely in front of nearplane
5006 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5013 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5021 if (clipdist[2] >= 0.0f)
5023 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5030 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5037 else if (clipdist[1] >= 0.0f)
5039 if (clipdist[2] >= 0.0f)
5041 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5048 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5054 else if (clipdist[2] >= 0.0f)
5056 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5061 else continue; // triangle is entirely behind nearplane
5064 // calculate integer y coords for triangle points
// screeni holds the truncated (x, y) of up to four points as saturated 16-bit pairs;
// with three points the third is duplicated into the fourth slot
5065 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5066 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5067 screenmin = _mm_min_epi16(screeni, screenir),
5068 screenmax = _mm_max_epi16(screeni, screenir);
5069 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5070 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the scissor / band limits prepared before the loop
5071 screenmin = _mm_max_epi16(screenmin, fbmin);
5072 screenmax = _mm_min_epi16(screenmax, fbmax);
5073 // skip offscreen triangles
5074 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// lane 1 is the y component; endy is exclusive
5076 starty = _mm_extract_epi16(screenmin, 1);
5077 endy = _mm_extract_epi16(screenmax, 1)+1;
// the rows fall entirely in the gap between the first and the second band
5078 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the sign-extended y of each packed (x, y) pair
5080 screeny = _mm_srai_epi32(screeni, 16);
5083 triangle = &thread->triangles[thread->numtriangles];
5085 // calculate attribute plans for triangle data...
5086 // okay, this triangle is going to produce spans, we'd better project
5087 // the interpolants now (this is what gives perspective texturing),
5088 // this consists of simply multiplying all arrays by the W coord
5089 // (which is basically 1/Z), which will be undone per-pixel
5090 // (multiplying by Z again) to get the perspective-correct array
5093 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5094 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar normal term: the factors that turn a
// difference along the two edges into a per-pixel x slope and y slope
5095 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5096 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5097 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5098 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5099 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// component 3 of each screen vertex, broadcast to all lanes
5100 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5101 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5102 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5103 attribedge1 = _mm_sub_ss(w0, w1);
5104 attribedge2 = _mm_sub_ss(w2, w1);
5105 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5106 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5107 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5108 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5109 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w is the plane of w over the screen: {x slope, y slope, value at the origin}
5110 _mm_store_ss(&triangle->w[0], attribxslope);
5111 _mm_store_ss(&triangle->w[1], attribyslope);
5112 _mm_store_ss(&triangle->w[2], attriborigin);
// optional clip plane: the same plane fit is done for component 2 of the screen
// vertices and combined with fb_clipplane into clip0origin / clip0slope, which
// give the x position of the clip boundary as a function of the row
5117 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5119 float cliporigin, clipxslope, clipyslope;
5120 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5121 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5123 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5124 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5125 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5126 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5127 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5128 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5131 clip0origin = -cliporigin/clipxslope;
5132 clip0slope = -clipyslope/clipxslope;
5133 clip0dir = clipxslope > 0 ? 1 : -1;
// branches for a plane that only varies with y: the boundary is expressed in whole framebuffer widths
5135 else if(clipyslope > 0)
5137 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5138 clip0slope = dpsoftrast.fb_width;
5141 else if(clipyslope < 0)
5143 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5144 clip0slope = -dpsoftrast.fb_width;
5147 else if(clip0origin < 0) continue;
5150 mipedgescale = _mm_setzero_ps();
// one plane {x slope, y slope, origin} per vertex array listed for the shader mode;
// attributes are multiplied by w first so the pixel shader can undo it per pixel
5151 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5153 __m128 attrib0, attrib1, attrib2;
5154 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5155 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array inside the command's data block
5157 arrays += numvertices*4;
5158 GENATTRIBS(attrib0, attrib1, attrib2);
5159 attriborigin = _mm_mul_ps(attrib1, w1);
5160 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5161 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5162 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5163 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5164 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5165 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5166 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5167 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the mode's lod array supplies the attribute change per unit of screen edge
// length (reciprocal edge lengths via rsqrt), used below for mip selection
5168 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5170 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5171 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5172 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5173 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit, defaulting to 0; only textures whose filter is
// above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
5177 memset(triangle->mip, 0, sizeof(triangle->mip));
5178 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5180 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5181 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5183 texture = thread->texbound[texunit];
5184 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints loaded from mipmap[0][2] onward, square, sum per edge, keep the smaller edge value
5186 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5187 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5188 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5189 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5190 // this will be multiplied in the texturing routine by the texture resolution
5191 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the squared density, i.e. log2 of the density, clamped to the last mip level
5194 y = (int)(log((float)y)*0.5f/M_LN2);
5195 if (y > texture->mipmaps - 1)
5196 y = texture->mipmaps - 1;
5197 triangle->mip[texunit] = y;
// row loop: the first pass is limited to the first band (bandy = min(endy, maxy1)),
// then y jumps to at least miny2 and the limit becomes the second band
5203 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5206 __m128 xcoords, xslope;
// ycc flags the points whose integer y is below the current row; the byte mask
// has four bits per point and selects the two active edges (p = start, n = end)
5207 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5208 int yccmask = _mm_movemask_epi8(ycc);
5209 int edge0p, edge0n, edge1p, edge1n;
// four-point (near-clipped) polygons
5218 case 0xFFFF: /*0000*/ y = endy; continue;
5219 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5220 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5221 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5222 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5223 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5224 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5225 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5226 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5227 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5228 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5229 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5230 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5231 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5232 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5233 case 0x0000: /*1111*/ y++; continue;
// plain triangles (the fourth slot duplicates the third point, so its nibble matches)
5241 case 0xFFFF: /*000*/ y = endy; continue;
5242 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5243 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5244 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5245 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5246 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5247 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5248 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can reuse the current edge pair, capped to the active band
5251 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5252 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5253 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5254 nexty = _mm_extract_epi16(ycc, 0);
5255 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both edges (delta divided by its own y component) and their x at row y, plus 0.5
5256 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5257 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5258 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5259 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5260 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half so lane 0 is the span start and lane 2 the span end
5261 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5263 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5264 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5266 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5267 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5269 int startx, endx, offset;
5270 startx = _mm_cvtss_si32(xcoords);
5271 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// horizontal scissor
5272 if (startx < minx) startx = minx;
5273 if (endx > maxx) endx = maxx;
5274 if (startx >= endx) continue;
// clip plane trimming against clip0 for this row
5282 if(endx <= clip0) continue;
5283 startx = (int)clip0;
5286 else if (endx > clip0)
5288 if(startx >= clip0) continue;
// a row is emitted as one or more spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5293 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5295 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5296 span->triangle = thread->numtriangles;
5300 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5301 if (span->startx >= span->endx)
// fixed-point depth for the span, derived from the w plane at the span origin;
// polygonoffset[1] is a constant bias and polygonoffset[0] scales with |w x slope|
5303 wslope = triangle->w[0];
5304 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5305 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5306 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// flush as soon as the span queue is full
5307 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5308 DPSOFTRAST_Draw_ProcessSpans(thread);
// flush and restart the triangle queue when it is full
5313 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5315 DPSOFTRAST_Draw_ProcessSpans(thread);
5316 thread->numtriangles = 0;
// all triangles handled: release this thread's reference; the last thread frees
// a separately allocated vertex data block
5320 if (!ATOMIC_DECREMENT(command->refcount))
5322 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5323 MM_FREE(command->arrays);
// process whatever is still queued
5326 if (thread->numspans > 0 || thread->numtriangles > 0)
5328 DPSOFTRAST_Draw_ProcessSpans(thread);
5329 thread->numtriangles = 0;
// Allocates a Draw command together with its vertex data block, and points the
// global post-transform array table at that block so the vertex shader can fill it.
//   firstvertex, numvertices - vertex range of the draw, recorded on the command and in dpsoftrast
//   numtriangles             - number of index triples copied into the command
//   element3i / element3s    - 32-bit / 16-bit index list; the copy lands at the end of the data block
// Returns the new command. Data block layout, in order: screen coordinates,
// post-transform positions, one array per entry of the shader mode's arrays
// list, then the index list.
5334 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5338 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] per vertex up front: screen coordinates and post-transform positions
5339 int datasize = 2*numvertices*sizeof(float[4]);
5340 DPSOFTRAST_Command_Draw *command;
5341 unsigned char *data;
// plus one float[4] per vertex for every array the current shader mode lists
5342 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5344 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5345 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5347 datasize += numvertices*sizeof(float[4]);
// plus the index list, in whichever element width is in use
5350 datasize += numtriangles*sizeof(unsigned short[3]);
5352 datasize += numtriangles*sizeof(int[3]);
5353 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one command slot: the command is queued on its own and the data
// block comes from a separate zeroed allocation (freed by the last thread to finish)
5354 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5356 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
// NOTE(review): the MM_CALLOC result is used below without a visible NULL check - confirm
5357 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data block sits directly behind the command struct
5361 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5362 data = (unsigned char *)command + commandsize;
5364 command->firstvertex = firstvertex;
5365 command->numvertices = numvertices;
5366 command->numtriangles = numtriangles;
5367 command->arrays = (float *)data;
// clear all destination pointers, then set only those this shader mode uses
5368 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5369 dpsoftrast.firstvertex = firstvertex;
5370 dpsoftrast.numvertices = numvertices;
5371 dpsoftrast.screencoord4f = (float *)data;
5372 data += numvertices*sizeof(float[4]);
5373 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5374 data += numvertices*sizeof(float[4]);
5375 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5377 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5378 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5380 dpsoftrast.post_array4f[j] = (float *)data;
5381 data += numvertices*sizeof(float[4]);
// both index pointers start out NULL; the list that is copied is placed after the vertex arrays
5383 command->element3i = NULL;
5384 command->element3s = NULL;
5387 command->element3s = (unsigned short *)data;
5388 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5392 command->element3i = (int *)data;
5393 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates the Draw command, runs the current shader
// mode's vertex stage on the calling thread, then publishes the command to the workers.
//   firstvertex, numvertices - vertex range used by the draw
//   numtriangles             - number of index triples
//   element3i / element3s    - 32-bit / 16-bit index list (forwarded to the allocator)
5398 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5400 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the vertex stage fills the arrays that the allocator just pointed dpsoftrast.post_array4f at
5401 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// Y range reported for the draw, clamped to the framebuffer height
5402 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5403 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty Y range: the command is withdrawn again, together with a separately allocated data block
5404 if (command->starty >= command->endy)
5406 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5407 MM_FREE(command->arrays);
5408 DPSOFTRAST_UndoCommand(command->commandsize);
5411 command->clipped = dpsoftrast.drawclipped;
// one reference per worker thread; each DPSOFTRAST_Interpret_Draw call drops one
5412 command->refcount = dpsoftrast.numthreads;
5414 if (dpsoftrast.usethreads)
5417 DPSOFTRAST_Draw_SyncCommands();
// wake only the threads that are marked starving and have a band overlapping the draw's Y range
5418 for (i = 0; i < dpsoftrast.numthreads; i++)
5420 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5421 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5422 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded path - confirm against the full source
5427 DPSOFTRAST_Draw_FlushThreads();
5431 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5432 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5434 thread->validate |= DPSOFTRAST_VALIDATE_FB;
5436 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5438 DPSOFTRAST_Command_SetRenderTargets *command;
5439 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5440 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5441 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5443 dpsoftrast.fb_width = width;
5444 dpsoftrast.fb_height = height;
5445 dpsoftrast.fb_depthpixels = depthpixels;
5446 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5447 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5448 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5449 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5450 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5451 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5452 command->width = width;
5453 command->height = height;
5456 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5458 int commandoffset = thread->commandoffset;
5459 while (commandoffset != endoffset)
5461 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5462 switch (command->opcode)
5464 #define INTERPCOMMAND(name) \
5465 case DPSOFTRAST_OPCODE_##name : \
5466 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5467 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5468 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5469 commandoffset = 0; \
5471 INTERPCOMMAND(Viewport)
5472 INTERPCOMMAND(ClearColor)
5473 INTERPCOMMAND(ClearDepth)
5474 INTERPCOMMAND(ColorMask)
5475 INTERPCOMMAND(DepthTest)
5476 INTERPCOMMAND(ScissorTest)
5477 INTERPCOMMAND(Scissor)
5478 INTERPCOMMAND(BlendFunc)
5479 INTERPCOMMAND(BlendSubtract)
5480 INTERPCOMMAND(DepthMask)
5481 INTERPCOMMAND(DepthFunc)
5482 INTERPCOMMAND(DepthRange)
5483 INTERPCOMMAND(PolygonOffset)
5484 INTERPCOMMAND(CullFace)
5485 INTERPCOMMAND(SetTexture)
5486 INTERPCOMMAND(SetShader)
5487 INTERPCOMMAND(Uniform4f)
5488 INTERPCOMMAND(UniformMatrix4f)
5489 INTERPCOMMAND(Uniform1i)
5490 INTERPCOMMAND(SetRenderTargets)
5491 INTERPCOMMAND(ClipPlane)
5493 case DPSOFTRAST_OPCODE_Draw:
5494 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5495 commandoffset += command->commandsize;
5496 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5498 thread->commandoffset = commandoffset;
5501 case DPSOFTRAST_OPCODE_Reset:
5506 thread->commandoffset = commandoffset;
5509 static int DPSOFTRAST_Draw_Thread(void *data)
5511 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5512 while(thread->index >= 0)
5514 if (thread->commandoffset != dpsoftrast.drawcommand)
5516 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5520 Thread_LockMutex(thread->drawmutex);
5521 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5523 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5524 thread->starving = true;
5525 Thread_CondWait(thread->drawcond, thread->drawmutex);
5526 thread->starving = false;
5528 Thread_UnlockMutex(thread->drawmutex);
5534 static void DPSOFTRAST_Draw_FlushThreads(void)
5536 DPSOFTRAST_State_Thread *thread;
5538 DPSOFTRAST_Draw_SyncCommands();
5539 if (dpsoftrast.usethreads)
5541 for (i = 0; i < dpsoftrast.numthreads; i++)
5543 thread = &dpsoftrast.threads[i];
5544 if (thread->commandoffset != dpsoftrast.drawcommand)
5546 Thread_LockMutex(thread->drawmutex);
5547 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5548 Thread_CondSignal(thread->drawcond);
5549 Thread_UnlockMutex(thread->drawmutex);
5552 for (i = 0; i < dpsoftrast.numthreads; i++)
5554 thread = &dpsoftrast.threads[i];
5555 if (thread->commandoffset != dpsoftrast.drawcommand)
5557 Thread_LockMutex(thread->drawmutex);
5558 if (thread->commandoffset != dpsoftrast.drawcommand)
5560 thread->waiting = true;
5561 Thread_CondWait(thread->waitcond, thread->drawmutex);
5562 thread->waiting = false;
5564 Thread_UnlockMutex(thread->drawmutex);
5570 for (i = 0; i < dpsoftrast.numthreads; i++)
5572 thread = &dpsoftrast.threads[i];
5573 if (thread->commandoffset != dpsoftrast.drawcommand)
5574 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5577 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper around DPSOFTRAST_Draw_FlushThreads: completes all queued
// rendering commands before returning.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Completes all queued rendering commands before returning; delegates to
// DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5590 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5600 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5601 dpsoftrast.bigendian = u.b[3];
5602 dpsoftrast.fb_width = width;
5603 dpsoftrast.fb_height = height;
5604 dpsoftrast.fb_depthpixels = depthpixels;
5605 dpsoftrast.fb_colorpixels[0] = colorpixels;
5606 dpsoftrast.fb_colorpixels[1] = NULL;
5607 dpsoftrast.fb_colorpixels[1] = NULL;
5608 dpsoftrast.fb_colorpixels[1] = NULL;
5609 dpsoftrast.viewport[0] = 0;
5610 dpsoftrast.viewport[1] = 0;
5611 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5612 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5613 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5614 dpsoftrast.texture_firstfree = 1;
5615 dpsoftrast.texture_end = 1;
5616 dpsoftrast.texture_max = 0;
5617 dpsoftrast.color[0] = 1;
5618 dpsoftrast.color[1] = 1;
5619 dpsoftrast.color[2] = 1;
5620 dpsoftrast.color[3] = 1;
5621 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5622 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5623 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5624 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5625 for (i = 0; i < dpsoftrast.numthreads; i++)
5627 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5629 thread->cullface = GL_BACK;
5630 thread->colormask[0] = 1;
5631 thread->colormask[1] = 1;
5632 thread->colormask[2] = 1;
5633 thread->colormask[3] = 1;
5634 thread->blendfunc[0] = GL_ONE;
5635 thread->blendfunc[1] = GL_ZERO;
5636 thread->depthmask = true;
5637 thread->depthtest = true;
5638 thread->depthfunc = GL_LEQUAL;
5639 thread->scissortest = false;
5640 thread->viewport[0] = 0;
5641 thread->viewport[1] = 0;
5642 thread->viewport[2] = dpsoftrast.fb_width;
5643 thread->viewport[3] = dpsoftrast.fb_height;
5644 thread->scissor[0] = 0;
5645 thread->scissor[1] = 0;
5646 thread->scissor[2] = dpsoftrast.fb_width;
5647 thread->scissor[3] = dpsoftrast.fb_height;
5648 thread->depthrange[0] = 0;
5649 thread->depthrange[1] = 1;
5650 thread->polygonoffset[0] = 0;
5651 thread->polygonoffset[1] = 0;
5652 thread->clipplane[0] = 0;
5653 thread->clipplane[1] = 0;
5654 thread->clipplane[2] = 0;
5655 thread->clipplane[3] = 1;
5657 thread->numspans = 0;
5658 thread->numtriangles = 0;
5659 thread->commandoffset = 0;
5660 thread->waiting = false;
5661 thread->starving = false;
5663 thread->validate = -1;
5664 DPSOFTRAST_Validate(thread, -1);
5666 if (dpsoftrast.usethreads)
5668 thread->waitcond = Thread_CreateCond();
5669 thread->drawcond = Thread_CreateCond();
5670 thread->drawmutex = Thread_CreateMutex();
5671 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5677 void DPSOFTRAST_Shutdown(void)
5680 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5682 DPSOFTRAST_State_Thread *thread;
5683 for (i = 0; i < dpsoftrast.numthreads; i++)
5685 thread = &dpsoftrast.threads[i];
5686 Thread_LockMutex(thread->drawmutex);
5688 Thread_CondSignal(thread->drawcond);
5689 Thread_UnlockMutex(thread->drawmutex);
5690 Thread_WaitThread(thread->thread, 0);
5691 Thread_DestroyCond(thread->waitcond);
5692 Thread_DestroyCond(thread->drawcond);
5693 Thread_DestroyMutex(thread->drawmutex);
5696 for (i = 0;i < dpsoftrast.texture_end;i++)
5697 if (dpsoftrast.texture[i].bytes)
5698 MM_FREE(dpsoftrast.texture[i].bytes);
5699 if (dpsoftrast.texture)
5700 free(dpsoftrast.texture);
5701 if (dpsoftrast.threads)
5702 MM_FREE(dpsoftrast.threads);
5703 memset(&dpsoftrast, 0, sizeof(dpsoftrast));