3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Compatibility shim intended for GCC releases older than 4.6 only: define
// _mm_cvtss_f32 in terms of the vector-extract builtin.  The version test
// compares major and minor together so that newer majors with a small minor
// number (e.g. 5.1) are not mistaken for old compilers.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
// Allocates a zero-filled array of nmemb elements of size bytes each,
// aligned to ALIGN_SIZE (calloc semantics layered on _mm_malloc).
// Returns NULL when the allocation fails or when nmemb*size does not fit
// in size_t.  The result must be released with _mm_free.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;

	// without this check the product can wrap around and a buffer much
	// smaller than requested would be handed back
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}
103 #define MM_FREE _mm_free
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets.  DPSOFTRAST_ARRAY_TOTAL is the number of arrays
// and is used to size the attribs[] and post_array4f[] tables.
110 typedef enum DPSOFTRAST_ARRAY_e
112 DPSOFTRAST_ARRAY_POSITION,
113 DPSOFTRAST_ARRAY_COLOR,
114 DPSOFTRAST_ARRAY_TEXCOORD0,
115 DPSOFTRAST_ARRAY_TEXCOORD1,
116 DPSOFTRAST_ARRAY_TEXCOORD2,
117 DPSOFTRAST_ARRAY_TEXCOORD3,
118 DPSOFTRAST_ARRAY_TEXCOORD4,
119 DPSOFTRAST_ARRAY_TEXCOORD5,
120 DPSOFTRAST_ARRAY_TEXCOORD6,
121 DPSOFTRAST_ARRAY_TEXCOORD7,
122 DPSOFTRAST_ARRAY_TOTAL
126 typedef struct DPSOFTRAST_Texture_s
133 DPSOFTRAST_TEXTURE_FILTER filter;
136 ATOMIC_COUNTER binds;
137 unsigned char *bytes;
138 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
147 unsigned char opcode;
148 unsigned short commandsize;
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
154 #define DEFCOMMAND(opcodeval, name, fields) \
155 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
158 unsigned char opcode; \
159 unsigned short commandsize; \
161 } DPSOFTRAST_Command_##name );
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
170 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
172 DPSOFTRAST_State_Command_Pool);
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
176 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
178 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
180 DPSOFTRAST_State_Triangle);
// SSE evaluation of one 4-component attribute at a span's position.
// slope receives attribs[arrayindex][0]; data receives
//   attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                          + span->y * attribs[arrayindex][1].
// The aligned loads rely on attribs[] being 16-byte aligned.
182 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
183 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
184 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
185 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
186 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar evaluation of one 4-component attribute at a span's position.
// slope[0..3] receives attribs[arrayindex][0]; data[0..3] receives
//   attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// Every pointer argument is parenthesized so that expressions such as
// "spans + n" can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
203 int triangle; // triangle this span was generated by
204 int x; // framebuffer x coord
205 int y; // framebuffer y coord
206 int startx; // usable range (according to pixelmask)
207 int endx; // usable range (according to pixelmask)
208 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210 int depthslope; // depthbuffer value pixel delta
212 DPSOFTRAST_State_Span);
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements.  DPSOFTRAST_RecalcBlendFunc maps
// (sfactor, dfactor) pairs onto these; unrecognized pairs fall back to OPAQUE.
223 typedef enum DPSOFTRAST_BLENDMODE_e
225 DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO
226 DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
227 DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
228 DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
229 DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
230 DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
231 DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
232 DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE with blendsubtract enabled
233 DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
234 DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
235 DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
237 DPSOFTRAST_BLENDMODE;
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
255 float polygonoffset[2];
257 ALIGN(float fb_clipplane[4]);
260 int shader_permutation;
261 int shader_exactspecularmath;
263 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
265 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
268 // DPSOFTRAST_VALIDATE_ flags
271 // derived values (DPSOFTRAST_VALIDATE_FB)
274 ALIGN(float fb_viewportcenter[4]);
275 ALIGN(float fb_viewportscale[4]);
277 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
280 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
289 ATOMIC(volatile int commandoffset);
291 volatile bool waiting;
292 volatile bool starving;
299 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
303 DPSOFTRAST_State_Thread);
305 typedef ALIGN(struct DPSOFTRAST_State_s
309 unsigned int *fb_depthpixels;
310 unsigned int *fb_colorpixels[4];
313 ALIGN(float fb_viewportcenter[4]);
314 ALIGN(float fb_viewportscale[4]);
317 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
320 const float *pointer_vertex3f;
321 const float *pointer_color4f;
322 const unsigned char *pointer_color4ub;
323 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
326 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
332 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333 float *screencoord4f;
339 int shader_permutation;
340 int shader_exactspecularmath;
344 int texture_firstfree;
345 DPSOFTRAST_Texture *texture;
350 const char *errorstring;
355 DPSOFTRAST_State_Thread *threads;
357 ATOMIC(volatile int drawcommand);
359 DPSOFTRAST_State_Command_Pool commandpool;
363 DPSOFTRAST_State dpsoftrast;
// depth buffer values are (1 - depth) scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into a BGRA8 word with rounding:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7.
// Arguments are parenthesized so expressions can be passed, and the alpha
// shift is done on an unsigned value because 255 << 24 overflows a signed int.
// The result type stays int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)((unsigned int)(((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f)) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth buffer representation.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
375 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377 fb_viewportcenter[3] = 0.5f;
378 fb_viewportcenter[0] = 0.0f;
379 fb_viewportscale[1] = 0.5f * viewport[2];
380 fb_viewportscale[2] = -0.5f * viewport[3];
381 fb_viewportscale[3] = 0.5f;
382 fb_viewportscale[0] = 1.0f;
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
387 if (dpsoftrast.interlace)
389 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
396 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
403 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
412 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413 // and viewport projection values
416 x1 = thread->scissor[0];
417 x2 = thread->scissor[0] + thread->scissor[2];
418 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419 y2 = dpsoftrast.fb_height - thread->scissor[1];
420 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
422 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
424 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425 thread->fb_scissor[0] = x1;
426 thread->fb_scissor[1] = y1;
427 thread->fb_scissor[2] = x2 - x1;
428 thread->fb_scissor[3] = y2 - y1;
430 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431 DPSOFTRAST_RecalcClipPlane(thread);
432 DPSOFTRAST_RecalcThread(thread);
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
437 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
442 if (thread->blendsubtract)
444 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
446 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
454 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
456 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested flags is
// pending on the thread; always evaluates to 0.  The thread argument is
// parenthesized so pointer expressions (e.g. "threads + i") expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
475 mask &= thread->validate;
478 if (mask & DPSOFTRAST_VALIDATE_FB)
480 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481 DPSOFTRAST_RecalcFB(thread);
483 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
485 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486 DPSOFTRAST_RecalcDepthFunc(thread);
488 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
490 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491 DPSOFTRAST_RecalcBlendFunc(thread);
495 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
497 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498 return &dpsoftrast.texture[index];
// Enlarges the global texture table (capacity raised to 1024 or doubled) and
// rebases every bound-texture pointer, both global and per-thread, onto the
// reallocated storage.
502 static void DPSOFTRAST_Texture_Grow(void)
// old base address, needed to turn texbound pointers back into offsets
504 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505 DPSOFTRAST_State_Thread *thread;
509 // expand texture array as needed
510 if (dpsoftrast.texture_max < 1024)
511 dpsoftrast.texture_max = 1024;
513 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check; on
// failure the old table pointer is overwritten (leak) and the rebasing
// below would be computed from NULL.
514 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the main state's bound textures
515 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516 if (dpsoftrast.texbound[i])
517 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bound textures
518 for (j = 0; j < dpsoftrast.numthreads; j++)
520 thread = &dpsoftrast.threads[j];
521 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522 if (thread->texbound[i])
523 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions, claims a free slot in
// the texture table, fills in the per-mip-level layout table and allocates
// zeroed pixel storage.  When a request is rejected an explanatory message is
// stored in dpsoftrast.errorstring.
// NOTE(review): the return statements are not visible in this view; confirm
// the success/failure return values against the full source.
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps carry six faces, everything else one
536 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int before the size limits below
// are applied, so very large dimensions can overflow here.
539 if (width*height*depth < 1)
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
546 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
551 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): this second copy of the DEPTH message appears to belong to a
// different branch (possibly the unknown-format case); if so the text is
// misleading - confirm against the full source.
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
568 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
573 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above;
// this looks like a redundant duplicate.
583 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
590 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any value that is not a power of two
593 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
595 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598 // find first empty slot in texture array
599 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600 if (!dpsoftrast.texture[texnum].bytes)
602 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
603 if (dpsoftrast.texture_max <= texnum)
604 DPSOFTRAST_Texture_Grow();
605 if (dpsoftrast.texture_end <= texnum)
606 dpsoftrast.texture_end = texnum + 1;
607 texture = &dpsoftrast.texture[texnum];
608 memset(texture, 0, sizeof(*texture));
609 texture->flags = flags;
610 texture->width = width;
611 texture->height = height;
612 texture->depth = depth;
613 texture->sides = sides;
// per-level record: [0] byte offset into bytes, [1] byte size of the level,
// [2] width, [3] height, [4] depth; texels are 4 bytes each
625 s = w * h * d * sides * 4;
626 texture->mipmap[mipmaps][0] = size;
627 texture->mipmap[mipmaps][1] = s;
628 texture->mipmap[mipmaps][2] = w;
629 texture->mipmap[mipmaps][3] = h;
630 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after level 0 when mipmaps are not requested
633 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
639 texture->mipmaps = mipmaps;
640 texture->size = size;
642 // allocate the pixels now
// NOTE(review): the allocation result is not checked on any visible line.
643 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free/used bookkeeping of the texture table.  Invalid handles are ignored.
647 void DPSOFTRAST_Texture_Free(int index)
649 DPSOFTRAST_Texture *texture;
650 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 MM_FREE(texture->bytes);
655 texture->bytes = NULL;
// zeroing the record also leaves bytes == NULL, which marks the slot unused
656 memset(texture, 0, sizeof(*texture));
657 // adjust the free range and used range
658 if (dpsoftrast.texture_firstfree > index)
659 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots off the used range
660 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture, each one by averaging
// texels of the next larger level (4 bytes per texel, rounded).
// Does nothing for invalid handles.
663 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
665 int i, x, y, z, w, layer0, layer1, row0, row1;
666 unsigned char *o, *i0, *i1, *i2, *i3;
667 DPSOFTRAST_Texture *texture;
668 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669 if (texture->mipmaps <= 1)
671 for (i = 1;i < texture->mipmaps;i++)
673 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
677 if (layer1 >= texture->mipmap[i-1][4])
678 layer1 = texture->mipmap[i-1][4]-1;
679 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
683 if (row1 >= texture->mipmap[i-1][3])
684 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four source rows in level i-1
// (layer0/layer1 x row0/row1)
685 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
686 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690 w = texture->mipmap[i][2];
693 if (texture->mipmap[i-1][2] > 1)
695 // average 3D texture
// eight source texels per output texel
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
698 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
706 // average 3D mipmap with parent width == 1
// NOTE(review): only i0/i1 advance in this loop; that is harmless only if w
// is 1 whenever the parent width is 1 - confirm.
707 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
709 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
718 if (texture->mipmap[i-1][2] > 1)
720 // average 2D texture (common case)
// four source texels per output texel
721 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
723 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
731 // 2D texture with parent width == 1
732 o[0] = (i0[0] + i1[0] + 1) >> 1;
733 o[1] = (i0[1] + i1[1] + 1) >> 1;
734 o[2] = (i0[2] + i1[2] + 1) >> 1;
735 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from pixels
// into the texture at (blockx, blocky), then rebuilds the mip chain.
// pixels is tightly packed (blockwidth*4 bytes per row).
// NOTE(review): on the visible lines the destination is always computed from
// mipmap[0]; the mip parameter does not appear to be used - confirm.
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
744 DPSOFTRAST_Texture *texture;
746 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; no bounds check against the texture size is visible
751 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752 while (blockheight > 0)
754 memcpy(dst, pixels, blockwidth * 4);
755 pixels += blockwidth * 4;
// advance one full texture row in the destination
756 dst += texture->mipmap[0][2] * 4;
760 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of a texture with the supplied pixels and
// rebuilds the mip chain.  pixels must provide mipmap[0][1] bytes (the byte
// size of level 0).  Invalid handles are ignored.
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
764 DPSOFTRAST_Texture *texture;
765 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770 DPSOFTRAST_Texture_CalculateMipmaps(index);
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
774 DPSOFTRAST_Texture *texture;
775 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776 return texture->mipmap[mip][2];
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
780 DPSOFTRAST_Texture *texture;
781 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782 return texture->mipmap[mip][3];
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
786 DPSOFTRAST_Texture *texture;
787 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of the given mip level inside the
// texture's pixel storage, or 0 (null) when the handle is invalid.
// NOTE(review): mip is used as an index into mipmap[] without a range check
// on the visible lines.
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
792 DPSOFTRAST_Texture *texture;
793 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within bytes
796 return texture->bytes + texture->mipmap[mip][0];
// Sets the sampling filter of a texture.  Filters ordered after
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected (with a message in
// dpsoftrast.errorstring) for textures created without the MIPMAP flag.
// Invalid handles are ignored.
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
800 DPSOFTRAST_Texture *texture;
801 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
804 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
809 texture->filter = filter;
812 static void DPSOFTRAST_Draw_FlushThreads(void);
814 static void DPSOFTRAST_Draw_SyncCommands(void)
816 if(dpsoftrast.usethreads) MEMORY_BARRIER;
817 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least space bytes available in the command pool by
// recomputing how much of it the threads still have to consume and, if that
// is not enough, waiting on a lagging thread.
// NOTE(review): loop structure and early exits are only partially visible in
// this view; the comments below describe the visible statements only.
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
822 DPSOFTRAST_State_Thread *thread;
824 int freecommand = dpsoftrast.commandpool.freecommand;
825 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room according to the cached usage figure?
826 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the threads
828 DPSOFTRAST_Draw_SyncCommands();
834 for (i = 0; i < dpsoftrast.numthreads; i++)
836 thread = &dpsoftrast.threads[i];
// distance between the write position and this thread's read position,
// wrapped into the pool size
837 commandoffset = freecommand - thread->commandoffset;
838 if (commandoffset < 0)
839 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// the largest distance seen becomes the usage figure
840 if (commandoffset > usedcommands)
843 usedcommands = commandoffset;
846 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
848 thread = &dpsoftrast.threads[waitindex];
849 Thread_LockMutex(thread->drawmutex);
// only wait when the thread has not yet caught up with the published position
850 if (thread->commandoffset != dpsoftrast.drawcommand)
852 thread->waiting = true;
// wake the thread first if it is idle waiting for commands
853 if (thread->starving) Thread_CondSignal(thread->drawcond);
854 Thread_CondWait(thread->waitcond, thread->drawmutex);
855 thread->waiting = false;
857 Thread_UnlockMutex(thread->drawmutex);
859 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds a non-negative size up to the next multiple of COMMAND_SIZE.
// The argument is evaluated exactly once.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((((size) + (COMMAND_SIZE - 1)) / COMMAND_SIZE) * COMMAND_SIZE)
// Reserves an aligned command of the named type in the command pool and
// returns it as a typed pointer.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a command in the command pool and stamps its
// opcode and commandsize header fields.
// NOTE(review): several short statements (position updates and the return)
// are not visible in this view; comments describe the visible lines only.
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
869 DPSOFTRAST_Command *command;
870 int freecommand = dpsoftrast.commandpool.freecommand;
871 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one bare command header in addition to the request
872 int extra = sizeof(DPSOFTRAST_Command);
// if the request does not fit before the end of the pool, the unusable tail
// counts as extra space that must be free as well
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for the threads (threaded) or flush
875 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
877 if (dpsoftrast.usethreads)
878 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
880 DPSOFTRAST_Draw_FlushThreads();
// re-read the pool positions, which the calls above may have changed
881 freecommand = dpsoftrast.commandpool.freecommand;
882 usedcommands = dpsoftrast.commandpool.usedcommands;
// tail too small: mark it with a Reset opcode and account for it as used
884 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
886 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887 command->opcode = DPSOFTRAST_OPCODE_Reset;
888 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
891 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892 command->opcode = opcode;
893 command->commandsize = size;
895 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
897 dpsoftrast.commandpool.freecommand = freecommand;
898 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives size bytes back to the command pool by adjusting its write position
// and usage counter.
// NOTE(review): the statements that move freecommand backwards and guard the
// wrap-around are not visible in this view - confirm against the full source.
902 static void DPSOFTRAST_UndoCommand(int size)
904 int freecommand = dpsoftrast.commandpool.freecommand;
905 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap the position back into the pool
908 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909 usedcommands -= size;
910 dpsoftrast.commandpool.freecommand = freecommand;
911 dpsoftrast.commandpool.usedcommands = usedcommands;
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
917 thread->viewport[0] = command->x;
918 thread->viewport[1] = command->y;
919 thread->viewport[2] = command->width;
920 thread->viewport[3] = command->height;
921 thread->validate |= DPSOFTRAST_VALIDATE_FB;
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
925 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
928 command->width = width;
929 command->height = height;
931 dpsoftrast.viewport[0] = x;
932 dpsoftrast.viewport[1] = y;
933 dpsoftrast.viewport[2] = width;
934 dpsoftrast.viewport[3] = height;
935 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to a constant color.
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a queued ClearColor command: fills the part of the scissor
// rectangle that falls into this thread's scanline ranges, in every color
// buffer that is present.
// NOTE(review): the innermost store and some loop scaffolding are not visible
// in this view.
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
941 int i, x1, y1, x2, y2, w, h, x, y;
942 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row ranges are current
946 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947 miny1 = thread->miny1;
948 maxy1 = thread->maxy1;
949 miny2 = thread->miny2;
950 maxy2 = thread->maxy2;
// fb_scissor is stored as origin + size; convert to corner coordinates
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to what this thread owns
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 // FIXME: honor fb_colormask?
// packed BGRA8 fill value
962 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963 for (i = 0;i < 4;i++)
965 if (!dpsoftrast.fb_colorpixels[i])
// walk the first band [y, maxy1), then jump to the second band [miny2, maxy2)
967 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
970 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971 for (x = x1;x < x2;x++)
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
978 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a constant depth.
985 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a queued ClearDepth command: fills the part of the scissor
// rectangle that falls into this thread's scanline ranges.
// NOTE(review): the innermost store is not visible in this view.
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
988 int x1, y1, x2, y2, w, h, x, y;
989 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row ranges are current
993 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994 miny1 = thread->miny1;
995 maxy1 = thread->maxy1;
996 miny2 = thread->miny2;
997 maxy2 = thread->maxy2;
// fb_scissor is stored as origin + size; convert to corner coordinates
998 x1 = thread->fb_scissor[0];
999 y1 = thread->fb_scissor[1];
1000 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to what this thread owns
1002 if (y1 < miny1) y1 = miny1;
1003 if (y2 > maxy2) y2 = maxy2;
// integer depth buffer value for the requested depth
1008 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walk the first band [y, maxy1), then jump to the second band [miny2, maxy2)
1009 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010 for (;y < bandy;y++)
1012 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013 for (x = x1;x < x2;x++)
1017 void DPSOFTRAST_ClearDepth(float d)
1019 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1026 thread->colormask[0] = command->r != 0;
1027 thread->colormask[1] = command->g != 0;
1028 thread->colormask[2] = command->b != 0;
1029 thread->colormask[3] = command->a != 0;
1030 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1034 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1044 thread->depthtest = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1047 void DPSOFTRAST_DepthTest(int enable)
1049 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050 command->enable = enable;
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1056 thread->scissortest = command->enable;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: allocates a ScissorTest command and records the requested enable flag in it.
1059 void DPSOFTRAST_ScissorTest(int enable)
1061 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062 command->enable = enable;
// Command 7 (Scissor): payload is the scissor rectangle as x, y, width, height floats.
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Interpreter: stores the rectangle in thread->scissor[0..3] (x, y, width, height order) and sets
// DPSOFTRAST_VALIDATE_FB in thread->validate.
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1068 thread->scissor[0] = command->x;
1069 thread->scissor[1] = command->y;
1070 thread->scissor[2] = command->width;
1071 thread->scissor[3] = command->height;
1072 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: allocates a Scissor command describing the rectangle (x, y, width, height).
// NOTE(review): only the width/height stores are visible in this listing; the x/y stores are not — confirm.
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1076 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1079 command->width = width;
1080 command->height = height;
// Command 8 (BlendFunc): payload is the source and destination blend factors.
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Interpreter: stores the factors as thread->blendfunc[0] (source) and [1] (destination) and sets
// DPSOFTRAST_VALIDATE_BLENDFUNC in thread->validate.
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1086 thread->blendfunc[0] = command->sfactor;
1087 thread->blendfunc[1] = command->dfactor;
1088 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public API: allocates a BlendFunc command carrying the source and destination blend factors.
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1092 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093 command->sfactor = sfactor;
1094 command->dfactor = dfactor;
// Command 9 (BlendSubtract): payload is a single enable flag.
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
// Interpreter: stores the flag in the thread state and sets DPSOFTRAST_VALIDATE_BLENDFUNC in
// thread->validate, the same validation bit the BlendFunc interpreter sets.
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1100 thread->blendsubtract = command->enable;
1101 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public API: allocates a BlendSubtract command and records the requested enable flag in it.
1103 void DPSOFTRAST_BlendSubtract(int enable)
1105 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106 command->enable = enable;
// Command 10 (DepthMask): payload is a single enable flag.
1109 DEFCOMMAND(10, DepthMask, int enable;)
// Interpreter: stores the flag in thread->depthmask; no validation bit is set in the visible code.
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1112 thread->depthmask = command->enable;
// Public API: allocates a DepthMask command and records the requested enable flag in it.
1114 void DPSOFTRAST_DepthMask(int enable)
1116 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117 command->enable = enable;
// Command 11 (DepthFunc): payload is the depth comparison function identifier.
1120 DEFCOMMAND(11, DepthFunc, int func;)
// Interpreter: stores the function id in thread->depthfunc.
// NOTE(review): unlike the DepthTest interpreter, no DPSOFTRAST_VALIDATE_DEPTHFUNC bit is set in the
// visible lines — confirm against the full file whether that is intentional.
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1123 thread->depthfunc = command->func;
// Public API: allocates a DepthFunc command and records the requested comparison function in it.
1125 void DPSOFTRAST_DepthFunc(int func)
1127 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128 command->func = func;
// Command 12 (DepthRange): payload is the near and far depth range values.
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Interpreter: stores the pair as thread->depthrange[0] (near) and [1] (far).
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1134 thread->depthrange[0] = command->nearval;
1135 thread->depthrange[1] = command->farval;
// Public API: allocates a DepthRange command carrying the near and far values.
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1139 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140 command->nearval = nearval;
1141 command->farval = farval;
// Command 13 (PolygonOffset): payload is the two polygon offset terms.
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Interpreter: stores the pair as thread->polygonoffset[0] (alongnormal) and [1] (intoview).
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1147 thread->polygonoffset[0] = command->alongnormal;
1148 thread->polygonoffset[1] = command->intoview;
// Public API: allocates a PolygonOffset command carrying both offset terms.
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1152 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153 command->alongnormal = alongnormal;
1154 command->intoview = intoview;
// Command 14 (CullFace): payload is the face culling mode.
1157 DEFCOMMAND(14, CullFace, int mode;)
// Interpreter: stores the mode in thread->cullface.
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1160 thread->cullface = command->mode;
// Public API: allocates a CullFace command and records the requested culling mode in it.
1162 void DPSOFTRAST_CullFace(int mode)
1164 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165 command->mode = mode;
// Public API: sets the current constant RGBA color directly in the global dpsoftrast state.
// No command is allocated in the visible code. DPSOFTRAST_Array_Load passes dpsoftrast.color to
// DPSOFTRAST_Fill4f when it fills the color array from this constant.
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1170 dpsoftrast.color[0] = r;
1171 dpsoftrast.color[1] = g;
1172 dpsoftrast.color[2] = b;
1173 dpsoftrast.color[3] = a;
// Public API: copies a rectangle of the color framebuffer (fb_colorpixels[0]) into outpixels,
// 4 bytes per pixel, with output rows blockwidth*4 bytes apart.
//   blockx, blocky            - origin of the requested block
//   blockwidth, blockheight   - size of the requested block in pixels
//   outpixels                 - destination buffer
// The block is clamped to the framebuffer before copying. There are two copy loops: one taken when
// dpsoftrast.bigendian is set and one otherwise (their per-pixel bodies are not visible in this listing).
// NOTE(review): output rows are indexed from the clamped by1 and columns start at offset 0, so when
// blockx/blocky are negative the output is not shifted by the clipped amount — confirm callers never
// pass negative origins.
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1178 int outstride = blockwidth * 4;
1179 int instride = dpsoftrast.fb_width * 4;
1182 int bx2 = blockx + blockwidth;
1183 int by2 = blocky + blockheight;
1187 unsigned char *inpixels;
// clamp the requested block to the framebuffer bounds
1191 if (bx1 < 0) bx1 = 0;
1192 if (by1 < 0) by1 = 0;
1193 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1196 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197 if (dpsoftrast.bigendian)
1199 for (y = by1;y < by2;y++)
// source row is fb_height - 1 - y: the framebuffer is read vertically flipped relative to output row order
1201 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202 o = (unsigned char *)outpixels + (y - by1) * outstride;
1203 for (x = bx1;x < bx2;x++)
1216 for (y = by1;y < by2;y++)
1218 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Public API: copies a width x height rectangle from the color framebuffer (fb_colorpixels[0])
// into mip level `mip` of texture `index`.
//   tx, ty - destination origin inside the texture mip level
//   sx, sy - source origin inside the framebuffer
// Returns without copying if the texture handle is invalid or mip is out of range.
// Both rectangles are clamped to their surfaces and the copy uses the smaller of the two sizes
// (tw/th/sw/sh are computed on lines not visible in this listing — presumably the clamped extents).
// If the texture has more than one mip level, the mipmap chain is regenerated afterwards.
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1229 int tx2 = tx + width;
1230 int ty2 = ty + height;
1233 int sx2 = sx + width;
1234 int sy2 = sy + height;
1244 unsigned int *spixels;
1245 unsigned int *tpixels;
1246 DPSOFTRAST_Texture *texture;
1247 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248 if (mip < 0 || mip >= texture->mipmaps) return;
1250 spixels = dpsoftrast.fb_colorpixels[0];
1251 swidth = dpsoftrast.fb_width;
1252 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3] as the level's width and height
1253 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254 twidth = texture->mipmap[mip][2];
1255 theight = texture->mipmap[mip][3];
// clamp destination and source rectangles to their surfaces
1256 if (tx1 < 0) tx1 = 0;
1257 if (ty1 < 0) ty1 = 0;
1258 if (tx2 > twidth) tx2 = twidth;
1259 if (ty2 > theight) ty2 = theight;
1260 if (sx1 < 0) sx1 = 0;
1261 if (sy1 < 0) sy1 = 0;
1262 if (sx2 > swidth) sx2 = swidth;
1263 if (sy2 > sheight) sy2 = sheight;
1268 if (tw > sw) tw = sw;
1269 if (th > sh) th = sh;
1270 if (tw < 1 || th < 1)
// convert sy1 to a flipped row index; the loop then walks source rows downward (sy1 - y)
// while destination rows go upward (ty1 + y)
1272 sy1 = sheight - 1 - sy1;
1273 for (y = 0;y < th;y++)
1274 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275 if (texture->mipmaps > 1)
1276 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): payload is the texture unit number and the texture to bind (may be NULL).
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Interpreter: atomically decrements the bind count of the texture previously bound to this unit
// on this thread (if any), then records the new binding. DPSOFTRAST_SetTexture adds
// dpsoftrast.numthreads to a texture's binds when it is bound.
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1282 if (thread->texbound[command->unitnum])
1283 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284 thread->texbound[command->unitnum] = command->texture;
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1288 DPSOFTRAST_Command_SetTexture *command;
1289 DPSOFTRAST_Texture *texture;
1290 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1292 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1295 texture = DPSOFTRAST_Texture_GetByIndex(index);
1296 if (index && !texture)
1298 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1302 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303 command->unitnum = unitnum;
1304 command->texture = texture;
1306 dpsoftrast.texbound[unitnum] = texture;
1307 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Public API: records the client position array and its stride in the global state.
// DPSOFTRAST_Array_Load later reads it at (byte pointer + firstvertex * stride) and expands it
// with DPSOFTRAST_Load3fTo4f, so stride is in bytes.
1310 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1312 dpsoftrast.pointer_vertex3f = vertex3f;
1313 dpsoftrast.stride_vertex = stride;
// Public API: selects a float RGBA client color array and its byte stride.
// The unsigned-byte color pointer is cleared so only one color source is set at a time.
1315 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1317 dpsoftrast.pointer_color4f = color4f;
1318 dpsoftrast.pointer_color4ub = NULL;
1319 dpsoftrast.stride_color = stride;
// Public API: selects an unsigned-byte RGBA client color array and its byte stride.
// The float color pointer is cleared so only one color source is set at a time.
1321 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1323 dpsoftrast.pointer_color4f = NULL;
1324 dpsoftrast.pointer_color4ub = color4ub;
1325 dpsoftrast.stride_color = stride;
// Public API: records the client texcoord array for one unit: pointer, components per element, byte stride.
// DPSOFTRAST_Array_Load switches on the component count to pick the 2-, 3- or 4-float loader.
// NOTE(review): unitnum indexes the arrays directly with no range check in the visible code — callers
// must pass a valid unit.
1327 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1329 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1330 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1331 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): payload is the shader mode, permutation bits and the exact-specular-math flag.
1334 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Interpreter: copies the three shader selection values into the thread state.
1335 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1337 thread->shader_mode = command->mode;
1338 thread->shader_permutation = command->permutation;
1339 thread->shader_exactspecularmath = command->exactspecularmath;
// Public API: selects the active shader. The values are written into a SetShader command and also
// mirrored into the global dpsoftrast state.
1341 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1343 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1344 command->mode = mode;
1345 command->permutation = permutation;
1346 command->exactspecularmath = exactspecularmath;
// mirror into the global state as well
1348 dpsoftrast.shader_mode = mode;
1349 dpsoftrast.shader_permutation = permutation;
1350 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f): payload is a uniform slot index and four float values.
1353 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Interpreter: copies the four floats into the thread's uniform table; each slot occupies four
// consecutive floats, hence the index*4 offset.
1354 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1356 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: sets a four-component float uniform. The values are written into a Uniform4f command
// and also mirrored into the global uniform table at float offset index*4.
1358 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1360 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1361 command->index = index;
1362 command->val[0] = v0;
1363 command->val[1] = v1;
1364 command->val[2] = v2;
1365 command->val[3] = v3;
// mirror into the global uniform table
1367 dpsoftrast.uniform4f[index*4+0] = v0;
1368 dpsoftrast.uniform4f[index*4+1] = v1;
1369 dpsoftrast.uniform4f[index*4+2] = v2;
1370 dpsoftrast.uniform4f[index*4+3] = v3;
// Public API: array-pointer variant of DPSOFTRAST_Uniform4f; v must point to at least four floats.
// The values are copied into a Uniform4f command and mirrored into the global uniform table.
1372 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1374 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1375 command->index = index;
1376 memcpy(command->val, v, sizeof(command->val));
1378 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): payload is a uniform slot index and 16 floats declared with ALIGN.
1381 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Interpreter: copies all 16 floats into the thread's uniform table starting at float offset
// index*4, so one matrix fills four consecutive four-float slots.
1382 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1384 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public API: uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   transpose - NOTE(review): the test on this parameter is not visible in this listing; the 4x4
//               transpose sequence below is presumably guarded by it — confirm.
//   v         - source floats, 16 per matrix; need not be aligned
// One UniformMatrix4f command is allocated per matrix, and the (possibly transposed) rows are also
// mirrored into the global uniform table.
1386 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix takes four uniform slots, so index advances by 4 while v advances by 16 floats
1390 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1392 __m128 m0, m1, m2, m3;
1393 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1394 command->index = (DPSOFTRAST_UNIFORM)index;
// choose unaligned or aligned loads depending on the source address
1395 if (((size_t)v)&(ALIGN_SIZE-1))
1397 m0 = _mm_loadu_ps(v);
1398 m1 = _mm_loadu_ps(v+4);
1399 m2 = _mm_loadu_ps(v+8);
1400 m3 = _mm_loadu_ps(v+12);
1404 m0 = _mm_load_ps(v);
1405 m1 = _mm_load_ps(v+4);
1406 m2 = _mm_load_ps(v+8);
1407 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1411 __m128 t0, t1, t2, t3;
1412 t0 = _mm_unpacklo_ps(m0, m1);
1413 t1 = _mm_unpacklo_ps(m2, m3);
1414 t2 = _mm_unpackhi_ps(m0, m1);
1415 t3 = _mm_unpackhi_ps(m2, m3);
1416 m0 = _mm_movelh_ps(t0, t1);
1417 m1 = _mm_movehl_ps(t1, t0);
1418 m2 = _mm_movelh_ps(t2, t3);
1419 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: command->val is declared with ALIGN; the global table slot is assumed to be
// 16-byte aligned as well — TODO confirm
1421 _mm_store_ps(command->val, m0);
1422 _mm_store_ps(command->val+4, m1);
1423 _mm_store_ps(command->val+8, m2);
1424 _mm_store_ps(command->val+12, m3);
1425 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1426 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1427 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1428 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): payload is a uniform slot index and one int value.
1433 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Interpreter: stores the value in the thread's integer uniform table.
1434 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1436 thread->uniform1i[command->index] = command->val;
// Public API: sets an integer uniform. A Uniform1i command is allocated and the value is mirrored
// into the global integer uniform table.
// NOTE(review): the store of i0 into command->val is not visible in this listing — confirm.
1438 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1440 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1441 command->index = index;
1444 dpsoftrast.uniform1i[command->index] = i0;
// Command 24 (ClipPlane): payload is the four plane coefficients.
1447 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Interpreter: copies the plane into the thread state and sets DPSOFTRAST_VALIDATE_FB in thread->validate.
1448 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1450 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1451 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: allocates a ClipPlane command carrying the plane coefficients (x, y, z, w).
1453 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1455 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1456 command->clipplane[0] = x;
1457 command->clipplane[1] = y;
1458 command->clipplane[2] = z;
1459 command->clipplane[3] = w;
// Copies `size` four-float elements from a strided byte source into dst as packed float4s.
//   dst    - destination, written with aligned stores (must be 16-byte aligned)
//   src    - first source element
//   stride - distance between source elements (src is a byte pointer; the advancing statements
//            are not visible in this listing, presumably in bytes — confirm)
// Unaligned loads are used when either the source address or the stride is not a multiple of
// ALIGN_SIZE; otherwise aligned loads are used. The loop headers are not visible in this listing.
1463 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1465 float *end = dst + size*4;
1466 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1470 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1479 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements from a strided byte source into packed float4s with w = 1.0.
//   dst    - destination, written with aligned stores
//   src    - first source element
//   stride - byte distance between source elements
// Tightly packed input (stride == sizeof(float[3])) takes a fast path that converts four elements
// (12 floats, three 16-byte loads) per step up to end4; loop headers and the remainder handling
// are not visible in this listing. Other strides go through the per-element path at the bottom.
// Lane trick used throughout: rotate so the lane to overwrite is lane 0, _mm_move_ss a 1.0 into it,
// then rotate back with _MM_SHUFFLE(0, 3, 2, 1) so the 1.0 ends up in lane 3 (w).
// NOTE(review): the per-element path loads 16 bytes from a 12-byte element, so the load for the
// last element touches 4 bytes beyond its three floats — confirm source buffers tolerate this.
1486 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1488 float *end = dst + size*4;
1489 if (stride == sizeof(float[3]))
1491 float *end4 = dst + (size&~3)*4;
1492 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = (x0 y0 z0 x1), v2 = (y1 z1 x2 y2), v3 = (z2 x3 y3 z3)
1496 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: (x0 y0 z0 1)
1497 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: gather (x1 x1 y1 z1) from v1/v2, replace lane 0 with 1.0, rotate to (x1 y1 z1 1)
1500 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1501 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: gather (x2 y2 z2 z2) from v2/v3, rotate, replace lane 0, rotate back to (x2 y2 z2 1)
1503 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1504 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1505 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: v3 already holds (z2 x3 y3 z3); replace lane 0 and rotate to (x3 y3 z3 1)
1507 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510 src += 4*sizeof(float[3]);
// same sequence as above, using aligned loads
1517 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1518 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1519 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1520 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1522 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1525 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1526 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531 src += 4*sizeof(float[3]);
// per-element path: load four floats, overwrite the fourth with 1.0
1535 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1539 __m128 v = _mm_loadu_ps((const float *)src);
1540 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1541 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1542 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1543 _mm_store_ps(dst, v);
1552 __m128 v = _mm_load_ps((const float *)src);
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556 _mm_store_ps(dst, v);
// Expands `size` two-float elements from a strided byte source into packed float4s (s, t, 0, 1).
//   dst    - destination, written with aligned stores
//   src    - first source element
//   stride - byte distance between source elements
// Tightly packed input (stride == sizeof(float[2])) is converted two elements per 16-byte load up
// to end2; the loop headers are not visible in this listing. The final visible store handles one
// element at a time by loading two floats into the low half of the (0, 0, 0, 1) template.
1563 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1565 float *end = dst + size*4;
// template supplying the z = 0, w = 1 upper lanes
1566 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1567 if (stride == sizeof(float[2]))
1569 float *end2 = dst + (size&~1)*4;
1570 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = (s0 t0 s1 t1): first store is (s0 t0 0 1), second takes v's high half as (s1 t1 0 1)
1574 __m128 v = _mm_loadu_ps((const float *)src);
1575 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1576 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1578 src += 2*sizeof(float[2]);
1585 __m128 v = _mm_load_ps((const float *)src);
1586 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1589 src += 2*sizeof(float[2]);
1595 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Expands `size` four-byte elements from a strided byte source into packed float4s, scaling each
// byte by 1/255 so 0..255 maps to 0.0..1.0.
//   dst    - destination, written with aligned stores
//   src    - first source element
//   stride - byte distance between source elements
// Tightly packed input (stride == 4) is converted four elements (16 bytes) per load up to end4;
// the loop headers are not visible in this listing. The last visible pair of lines converts one
// element at a time.
1601 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1603 float *end = dst + size*4;
1604 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1605 if (stride == sizeof(unsigned char[4]))
1607 float *end4 = dst + (size&~3)*4;
1608 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bits (v1 = elements 0-1, v2 = elements 2-3) by interleaving with zero,
// then to 32 bits per store before converting to float
1612 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1613 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1614 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1615 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1616 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1618 src += 4*sizeof(unsigned char[4]);
// same conversion with an aligned 16-byte load
1625 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631 src += 4*sizeof(unsigned char[4]);
// single element: read its four bytes as one 32-bit value, widen, convert, scale
1637 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1638 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` float4 elements of dst with the single float4 read from src.
// src is read once with an unaligned load; dst is written with aligned stores. The loop that
// advances dst up to `end` is not visible in this listing.
1644 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1646 float *end = dst + 4*size;
1647 __m128 v = _mm_loadu_ps(src);
1650 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vertices by a 4x4 matrix.
//   out4f       - destination (may equal in4f for in-place use, as DPSOFTRAST_Array_Transform does)
//   in4f        - source vertices; aligned or unaligned loads are chosen from its address
//   inmatrix16f - 16 floats read as four consecutive float4 groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. The store of that sum and the loop headers
// are not visible in this listing.
1656 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1659 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1660 __m128 m0, m1, m2, m3;
// bitwise comparison against the identity matrix
1662 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1664 // fast case for identity matrix
1665 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1668 end = out4f + numitems*4;
1669 m0 = _mm_loadu_ps(inmatrix16f);
1670 m1 = _mm_loadu_ps(inmatrix16f + 4);
1671 m2 = _mm_loadu_ps(inmatrix16f + 8);
1672 m3 = _mm_loadu_ps(inmatrix16f + 12);
1673 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1677 __m128 v = _mm_loadu_ps(in4f);
1679 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1680 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1681 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1682 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1691 __m128 v = _mm_load_ps(in4f);
1693 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1695 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1696 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vertices from in4f to out4f. Uses memcpy, so the two ranges must not overlap.
1704 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1706 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a clip-space vertex (x, y, z, w) and applies the viewport mapping.
// The input is rotated to (1, x, y, z), multiplied by viewportscale, divided by w, offset by
// viewportcenter, and rotated back, so the result is
//   (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// where c/s are the viewportcenter/viewportscale lanes; lane 0 of those vectors therefore applies
// to the 1/w term. `out` and `in` are __m128 expressions; `in` is evaluated once.
1710 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1712 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1713 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1714 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1715 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro whose visible lines are identical to DPSOFTRAST_PROJECTVERTEX: rotate the input
// to (1, x, y, z), scale by viewportscale, divide by w, add viewportcenter, rotate back.
// NOTE(review): no use of this macro appears in the visible part of the file — confirm whether it
// is still needed or differs from DPSOFTRAST_PROJECTVERTEX on lines not shown here.
1718 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1720 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1721 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1722 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1723 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a float4 vertex by a matrix given as four __m128 groups:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// p is a local copy of `in` (its declaration is on a line not visible in this listing).
1726 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1729 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1730 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1731 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1732 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space Y range for an axis-aligned bounding box after transformation.
//   starty, endy      - outputs: integer Y range covering the projected box
//   minposf, maxposf  - box corners as float4, read with aligned loads (must be 16-byte aligned)
//   inmatrix16f       - transform applied to the eight box corners
// The eight corners are transformed (BBFRONT). A corner with z + w >= 0 clears its bit in
// clipmask and contributes its projected y/w to the running min/max. For each corner whose bit is
// still set, BBCLIP walks its three box-edge neighbours (k^1, k^2, k^4); for every neighbour that
// passed, the edge is intersected with the z + w = 0 plane and the intersection point is projected.
// NOTE(review): the return statement is not visible in this listing. Callers pass the result
// through to dpsoftrast.drawclipped, so it presumably returns clipmask — confirm.
1735 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per box corner; a bit stays set while that corner fails the z + w >= 0 test
1737 int clipmask = 0xFF;
1738 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start inverted (2, -2) so the first accepted point replaces both
1739 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1740 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1741 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix group so the transformed Y lands in lane 0, where the
// scalar (_ss) operations below work
1742 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1743 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1744 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1745 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k; if z + w >= 0, mark it as passed and fold its y/w into minproj/maxproj
1746 #define BBFRONT(k, pos) \
1748 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1749 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1750 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1753 clipmask &= ~(1<<k); \
1754 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1755 minproj = _mm_min_ss(minproj, proj); \
1756 maxproj = _mm_max_ss(maxproj, proj); \
1760 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1761 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1762 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1763 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1764 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1769 if (clipmask&(1<<k)) \
1771 if (!(clipmask&(1<<(k^1)))) \
1773 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1774 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1775 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1776 minproj = _mm_min_ss(minproj, proj); \
1777 maxproj = _mm_max_ss(maxproj, proj); \
1779 if (!(clipmask&(1<<(k^2)))) \
1781 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1782 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1783 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1784 minproj = _mm_min_ss(minproj, proj); \
1785 maxproj = _mm_max_ss(maxproj, proj); \
1787 if (!(clipmask&(1<<(k^4)))) \
1789 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1790 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1791 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1792 minproj = _mm_min_ss(minproj, proj); \
1793 maxproj = _mm_max_ss(maxproj, proj); \
1797 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the Y term) into lane 0 for the scalar math below
1798 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1799 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixels
1800 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1801 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1802 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1803 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// the start row comes from maxproj and the end row from minproj, which presumably reflects a
// negative Y viewport scale (screen Y grows downward) — confirm
1804 *starty = _mm_cvttss_si32(maxproj);
1805 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed float4 vertices to screen space.
//   out4f    - receives a copy of each input vertex (aligned stores)
//   screen4f - receives the projected vertex (DPSOFTRAST_PROJECTVERTEX, aligned stores)
//   starty, endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f     - source vertices; aligned or unaligned loads are chosen from its address
// While copying, the component-wise min and max of the inputs are tracked; that bounding box is
// passed to DPSOFTRAST_Vertex_BoundY with an identity matrix and its result is returned.
// The loop headers and pointer increments are not visible in this listing.
// NOTE(review): the first vertex is loaded unconditionally to seed min/max, so numitems is
// assumed to be at least 1 — confirm.
1809 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1811 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1812 float *end = out4f + numitems*4;
1813 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1814 __m128 minpos, maxpos;
1815 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1817 minpos = maxpos = _mm_loadu_ps(in4f);
1820 __m128 v = _mm_loadu_ps(in4f);
1821 minpos = _mm_min_ps(minpos, v);
1822 maxpos = _mm_max_ps(maxpos, v);
1823 _mm_store_ps(out4f, v);
1824 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1825 _mm_store_ps(screen4f, v);
1833 minpos = maxpos = _mm_load_ps(in4f);
1836 __m128 v = _mm_load_ps(in4f);
1837 minpos = _mm_min_ps(minpos, v);
1838 maxpos = _mm_max_ps(maxpos, v);
1839 _mm_store_ps(out4f, v);
1840 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1841 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned storage because DPSOFTRAST_Vertex_BoundY uses aligned loads
1849 ALIGN(float minposf[4]);
1850 ALIGN(float maxposf[4]);
1851 _mm_store_ps(minposf, minpos);
1852 _mm_store_ps(maxposf, maxpos);
1853 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` float4 vertices by inmatrix16f and projects them to screen space.
//   out4f    - receives the transformed vertex (aligned stores)
//   screen4f - receives the projected vertex (aligned stores)
//   starty, endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f     - source vertices; aligned or unaligned loads are chosen from its address
// An identity matrix (bitwise compare) is delegated to DPSOFTRAST_Vertex_Project. Otherwise the
// min/max bounding box is accumulated over the UNtransformed inputs, which is why BoundY is then
// called with inmatrix16f rather than an identity matrix.
// The loop headers and pointer increments are not visible in this listing.
1858 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1860 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1861 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1863 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1864 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1865 end = out4f + numitems*4;
1866 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1867 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1868 m0 = _mm_loadu_ps(inmatrix16f);
1869 m1 = _mm_loadu_ps(inmatrix16f + 4);
1870 m2 = _mm_loadu_ps(inmatrix16f + 8);
1871 m3 = _mm_loadu_ps(inmatrix16f + 12);
1872 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1874 minpos = maxpos = _mm_loadu_ps(in4f);
1877 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform is applied
1878 minpos = _mm_min_ps(minpos, v);
1879 maxpos = _mm_max_ps(maxpos, v);
1880 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1881 _mm_store_ps(out4f, v);
1882 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1883 _mm_store_ps(screen4f, v);
1891 minpos = maxpos = _mm_load_ps(in4f);
1894 __m128 v = _mm_load_ps(in4f);
1895 minpos = _mm_min_ps(minpos, v);
1896 maxpos = _mm_max_ps(maxpos, v);
1897 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1898 _mm_store_ps(out4f, v);
1899 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1900 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned storage because DPSOFTRAST_Vertex_BoundY uses aligned loads
1908 ALIGN(float minposf[4]);
1909 ALIGN(float maxposf[4]);
1910 _mm_store_ps(minposf, minpos);
1911 _mm_store_ps(maxposf, maxpos);
1912 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one client vertex array into the float4 working array post_array4f[outarray] for the
// current draw range (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   outarray - index of the destination working array
//   inarray  - which client array to read: DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a
//              texcoord array (indexed as inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
// Positions use the 3-float loader. Colors use the float loader, the byte loader, or a fill from
// the constant dpsoftrast.color. Texcoords pick the 2-, 3- or 4-float loader by component count.
// The switch header, case labels for texcoords, break statements and the return are not visible in
// this listing; the function presumably returns outf — confirm.
1918 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1921 float *outf = dpsoftrast.post_array4f[outarray];
1922 const unsigned char *inb;
1923 int firstvertex = dpsoftrast.firstvertex;
1924 int numvertices = dpsoftrast.numvertices;
1928 case DPSOFTRAST_ARRAY_POSITION:
1929 stride = dpsoftrast.stride_vertex;
// strides are applied to a byte pointer, so they are byte strides
1930 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1931 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1933 case DPSOFTRAST_ARRAY_COLOR:
1934 stride = dpsoftrast.stride_color;
1935 if (dpsoftrast.pointer_color4f)
1937 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1938 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1940 else if (dpsoftrast.pointer_color4ub)
1942 stride = dpsoftrast.stride_color;
1943 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1944 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// constant-color fill (presumably the branch taken when neither color pointer is set — confirm)
1948 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1952 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1953 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1955 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1956 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1959 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1962 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1965 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Loads client array `inarray` into working array `outarray` (or reuses the existing contents of
// post_array4f[outarray] when inarray is negative) and transforms it in place by inmatrix16f.
// The return statement is not visible in this listing.
1977 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1979 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1980 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Loads client array `inarray` into working array `outarray` (or reuses the existing contents when
// inarray is negative) and projects it: screen coordinates go to dpsoftrast.screencoord4f, the Y
// range to dpsoftrast.drawstarty/drawendy, and the projector's return value to dpsoftrast.drawclipped.
// The return statement is not visible in this listing.
1985 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1988 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Loads client array `inarray` into working array `outarray` (or reuses the existing contents when
// inarray is negative), transforms it in place by inmatrix16f and projects it: screen coordinates go
// to dpsoftrast.screencoord4f, the Y range to dpsoftrast.drawstarty/drawendy, and the projector's
// return value to dpsoftrast.drawclipped. The return statement is not visible in this listing.
1997 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2000 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2001 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Fills zf[startx..endx) with the per-pixel reciprocal of the interpolated w for one span.
//   triangle - supplies the w plane: w[0] is the x slope, w[1] the y slope, w[2] the constant term
//   span     - supplies the span origin (x, y) and the pixel range [startx, endx)
//   zf       - output array indexed by pixel x within the span
// When the x slope is zero the value is constant along the span (the loop body is not visible in
// this listing). Otherwise the span is cut into subspans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// exact reciprocal is computed only at subspan ends and linearly interpolated in between.
2008 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2011 int startx = span->startx;
2012 int endx = span->endx;
2013 float wslope = triangle->w[0];
// w at the span origin; adding wslope * x gives w at pixel x
2014 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2015 float endz = 1.0f / (w + wslope * startx);
2016 if (triangle->w[0] == 0)
2018 // LordHavoc: fast flat polygons (HUD/menu)
2019 for (x = startx;x < endx;x++)
2023 for (x = startx;x < endx;)
2025 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is clamped so it ends exactly on the final pixel
2027 if (nextsub >= endx) nextsub = endsub = endx-1;
2028 endz = 1.0f / (w + wslope * nextsub);
// guard against a zero-length step when the subspan is a single pixel
2029 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2030 for (; x <= endsub; x++, z += dz)
// Final stage for a span whose shaded colors are stored as 4 bytes per pixel in in4ub
// (byte 3 of each pixel is used as alpha).
// Steps visible here:
//  1. with SHADERPERMUTATION_ALPHAKILL, pixels whose alpha is below 128 are removed from span->pixelmask
//  2. for the ALPHA / ADDALPHA / SUBALPHA blend modes, pixels whose alpha byte is 0 are removed too
//  3. the mask is walked as alternating runs of disabled and enabled pixels
//  4. each enabled run [x, subx) is copied or blended into dpsoftrast.fb_colorpixels[0]
//     according to thread->fb_blendmode
// in4ub, the mask and the framebuffer pointers are all indexed with the span-relative x.
// triangle is not referenced by any line visible here.
2035 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2039 int startx = span->startx;
2040 int endx = span->endx;
// 32-bit view of the input colors, one unsigned int per pixel
2043 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2044 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same destination (color buffer 0)
2045 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2046 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views to the span origin (row span->y, column span->x)
2049 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2050 pixeli += span->y * dpsoftrast.fb_width + span->x;
2051 // handle alphatest now (this affects depth writes too)
2052 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2053 for (x = startx;x < endx;x++)
2054 if (in4ub[x*4+3] < 128)
2055 pixelmask[x] = false;
2056 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2057 // helps sprites, text and hud artwork
2058 switch(thread->fb_blendmode)
2060 case DPSOFTRAST_BLENDMODE_ALPHA:
2061 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2062 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alternates between runs with alpha >= 1 (left alone) and runs with alpha == 0 (masked off)
2064 for (x = startx;x < endx;x++)
2066 if (in4ub[x*4+3] >= 1)
2071 while (++x < endx && in4ub[x*4+3] >= 1) ;
2073 if (x >= endx) break;
2075 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2076 if (x >= endx) break;
// the remaining blend modes leave the mask untouched in the lines visible here
2083 case DPSOFTRAST_BLENDMODE_OPAQUE:
2084 case DPSOFTRAST_BLENDMODE_ADD:
2085 case DPSOFTRAST_BLENDMODE_INVMOD:
2086 case DPSOFTRAST_BLENDMODE_MUL:
2087 case DPSOFTRAST_BLENDMODE_MUL2:
2088 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2089 case DPSOFTRAST_BLENDMODE_INVADD:
2092 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): these two stores touch pixelmask[endx] and pixelmask[endx+1], so the mask
// buffer needs at least endx+2 entries - confirm at the place where it is allocated
2093 pixelmask[endx] = 1;
2094 pixelmask[endx+1] = 0;
2095 // LordHavoc: use a double loop to identify subspans, this helps the
2096 // optimized copy/blend loops to perform at their best, most triangles
2097 // have only one run of pixels, and do the search using wide reads...
2101 // if this pixel is masked off, it's probably not alone...
2108 // the 4-item search must be aligned or else it stalls badly
2109 if ((x & 3) && !pixelmask[x])
2111 if(pixelmask[x]) goto endmasked;
2115 if(pixelmask[x]) goto endmasked;
2119 if(pixelmask[x]) goto endmasked;
// skips masked-off pixels by testing four mask bytes with a single 32-bit read
2124 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// byte-wise remainder; the nonzero sentinel at pixelmask[endx] stops this loop
2128 for (;!pixelmask[x];x++)
2130 // rather than continue the loop, just check the end variable
2135 // find length of subspan
2138 if (subx + 8 < endx)
2142 if(!pixelmask[subx]) goto endunmasked;
2146 if(!pixelmask[subx]) goto endunmasked;
2150 if(!pixelmask[subx]) goto endunmasked;
// wide scan over enabled pixels: only matches when all four mask bytes are exactly 1
2155 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
// byte-wise remainder; the zero sentinel at pixelmask[endx+1] stops this loop
2159 for (;pixelmask[subx];subx++)
2161 // the checks can overshoot, so make sure to clip it...
2165 // now that we know the subspan length... process!
2166 switch(thread->fb_blendmode)
// opaque: the source pixels replace the destination pixels unchanged
// NOTE(review): both a memcpy and SSE copy loops are visible below; how one is chosen over
// the other is not visible in this excerpt
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
2172 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels per iteration using four unaligned 128-bit copies
2177 while (x + 16 <= subx)
2179 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2180 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2181 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2182 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2187 while (x + 4 <= subx)
2189 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2195 pixeli[x+1] = ini[x+1];
// FINISHBLEND(blend2, blend1) is the shared blend loop for every non-opaque mode: it handles
// two pixels per iteration while x + 1 < subx and then a single pixel; in both cases
// the source and destination bytes are widened to 16-bit lanes (src, dst) and dst is packed
// back to bytes with unsigned saturation before being stored.
// Presumably blend2 is the statement applied to a pixel pair and blend1 the one applied to a
// single pixel (the lines that expand them are not visible here) - confirm.
// ALPHA, per channel: dst = dst + (src - dst) * srcalpha / 256
2205 case DPSOFTRAST_BLENDMODE_ALPHA:
2206 #define FINISHBLEND(blend2, blend1) \
2207 for (;x + 1 < subx;x += 2) \
2210 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2211 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2213 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2218 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2221 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2228 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA, per channel: dst = dst + ((src * srcalpha) >> 8)
2232 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD, per channel: dst = src + dst (saturated when packed back to bytes)
2241 case DPSOFTRAST_BLENDMODE_ADD:
2242 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD, per channel: dst = dst - ((dst * src) >> 8)
2244 case DPSOFTRAST_BLENDMODE_INVMOD:
2246 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL, per channel: dst = (src * dst) >> 8
2251 case DPSOFTRAST_BLENDMODE_MUL:
2252 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2, per channel: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2254 case DPSOFTRAST_BLENDMODE_MUL2:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA, per channel: dst = dst - ((src * srcalpha) >> 8)
2257 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2259 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2262 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA, per channel: dst = src + (dst - ((dst * srcalpha) >> 8))
2266 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD, per channel: dst = dst + (255 - dst) * src / 256
2275 case DPSOFTRAST_BLENDMODE_INVADD:
2277 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples a single texel of mip level mip at coordinates (x, y), which are scaled by the
// width and height of that mip level, and returns the 4 bytes in c.
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR the four neighbouring texels are blended
// bilinearly, otherwise one texel is selected. Texel indices are clamped to
// [0, size-1] when DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set and ANDed with size-1
// otherwise (the AND only acts as a wrap-around when the size is a power of two).
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288 // warning: this is SLOW, only use if the optimized per-span functions won't do
2290 const unsigned char * RESTRICT pixelbase;
2291 const unsigned char * RESTRICT pixel[4];
// entries [2] and [3] of the mip record are its width and height, entry [0] its byte offset
2292 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293 int wrapmask[2] = { width-1, height-1 };
2294 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2295 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// texel position with 12 fractional bits, shifted back by half a texel (2048 = 0.5 * 4096)
// NOTE(review): tc is unsigned, so tc>>12 is never negative and the ">= 0" tests in the
// clamp branch below cannot fail; a negative coordinate depends on how the float to
// unsigned conversion behaves - confirm this is intended
2297 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2298 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights of the four texels; together they always sum to 0x1000 * 0x1000 = 1<<24
2300 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2302 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2303 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2305 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2312 tci[0] &= wrapmask[0];
2313 tci[1] &= wrapmask[1];
2314 tci1[0] &= wrapmask[0];
2315 tci1[1] &= wrapmask[1];
// texel addresses: [0] = (x, y), [1] = (x+1, y), [2] = (x, y+1), [3] = (x+1, y+1)
2317 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2318 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2319 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2320 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum of the four texels; >>24 removes the 1<<24 weight scale
2321 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// unfiltered lookup: the scaled coordinate is truncated to an int texel index
// NOTE(review): the stores into c for this path are not visible in this excerpt
2328 int tci[2] = { x * width, y * height };
2329 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2331 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2336 tci[0] &= wrapmask[0];
2337 tci[1] &= wrapmask[1];
2339 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Samples the 2D texture bound to unit texunitindex across one span and writes the result
// as 4 floats per pixel to out4f[x*4 .. x*4+3] for x in [startx, endx).
// Texture coordinates come from attribute array arrayindex (DPSOFTRAST_CALCATTRIB4F gives a
// base value and a per-x slope); they are multiplied by zf[x] and by the mip level's
// width/height. Exact coordinates are only computed at sub-span boundaries
// (DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart); in between they are stepped in fixed point
// with 12 fractional bits.
// Texel bytes 2, 1, 0, 3 are written to output channels 0, 1, 2, 3, scaled to 0..1.
2347 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2350 int startx = span->startx;
2351 int endx = span->endx;
// tc = coordinates at the start of the current sub-span, endtc = at its end
2356 float tc[2], endtc[2];
// NOTE(review): the texel indices and their clamp limits are unsigned, so the ">= tcimin"
// comparisons in the clamp paths below are unsigned comparisons - confirm this is intended
2358 unsigned int tci[2];
2359 unsigned int tci1[2];
2360 unsigned int tcimin[2];
2361 unsigned int tcimax[2];
2366 const unsigned char * RESTRICT pixelbase;
2367 const unsigned char * RESTRICT pixel[4];
2368 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2369 // if no texture is bound, just fill it with white
2372 for (x = startx;x < endx;x++)
2374 out4f[x*4+0] = 1.0f;
2375 out4f[x*4+1] = 1.0f;
2376 out4f[x*4+2] = 1.0f;
2377 out4f[x*4+3] = 1.0f;
2381 mip = triangle->mip[texunitindex];
// entry [0] of the mip record is the byte offset of this mip level inside texture->bytes
2382 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2383 // if this mipmap of the texture is 1 pixel, just fill it with that color
2384 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the color is read from texture->bytes[0..3], not from pixelbase; the two
// only coincide when mipmap[mip][0] is 0 - confirm which one is intended
2386 c[0] = texture->bytes[2] * (1.0f/255.0f);
2387 c[1] = texture->bytes[1] * (1.0f/255.0f);
2388 c[2] = texture->bytes[0] * (1.0f/255.0f);
2389 c[3] = texture->bytes[3] * (1.0f/255.0f);
2390 for (x = startx;x < endx;x++)
2392 out4f[x*4+0] = c[0];
2393 out4f[x*4+1] = c[1];
2394 out4f[x*4+2] = c[2];
2395 out4f[x*4+3] = c[3];
// nonzero when the texture asks for linear filtering
2399 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2400 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2401 flags = texture->flags;
// entries [2] and [3] of the mip record are used as the coordinate scale for the two axes;
// entry [2] is also the row length in texels
2402 tcscale[0] = texture->mipmap[mip][2];
2403 tcscale[1] = texture->mipmap[mip][3];
2404 tciwidth = texture->mipmap[mip][2];
2407 tcimax[0] = texture->mipmap[mip][2]-1;
2408 tcimax[1] = texture->mipmap[mip][3]-1;
2409 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2410 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact coordinates at the first pixel of the span
2411 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2412 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// one iteration per sub-span; x is advanced by the inner per-pixel loops
2418 for (x = startx;x < endx;)
2420 unsigned int subtc[2];
2421 unsigned int substep[2];
// converts a coordinate delta over a full sub-span into a per-pixel step with 12 fractional bits
2422 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2423 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2424 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step for its shorter length
2426 nextsub = endsub = endx-1;
2427 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2431 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2432 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): substep and subtc are unsigned, so a negative step or coordinate depends on
// how the float to unsigned conversion behaves - confirm this is intended
2438 substep[0] = (endtc[0] - tc[0]) * subscale;
2439 substep[1] = (endtc[1] - tc[1]) * subscale;
2440 subtc[0] = tc[0] * (1<<12);
2441 subtc[1] = tc[1] * (1<<12);
// bilinear sampling with indices clamped to [tcimin, tcimax]
2444 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2446 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2448 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2449 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// weights of the four texels; together they always sum to 1<<24
2450 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2451 tci[0] = subtc[0]>>12;
2452 tci[1] = subtc[1]>>12;
2453 tci1[0] = tci[0] + 1;
2454 tci1[1] = tci[1] + 1;
2455 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2456 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2457 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2458 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2459 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2461 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2462 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * (1<<24): removes the weight scale and maps 0..255 to 0..1 in one step
2463 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2464 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2465 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2466 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2467 out4f[x*4+0] = c[0];
2468 out4f[x*4+1] = c[1];
2469 out4f[x*4+2] = c[2];
2470 out4f[x*4+3] = c[3];
// bilinear sampling with indices wrapped by ANDing with size-1
2475 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2477 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2478 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2479 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2480 tci[0] = subtc[0]>>12;
2481 tci[1] = subtc[1]>>12;
2482 tci1[0] = tci[0] + 1;
2483 tci1[1] = tci[1] + 1;
2484 tci[0] &= tciwrapmask[0];
2485 tci[1] &= tciwrapmask[1];
2486 tci1[0] &= tciwrapmask[0];
2487 tci1[1] &= tciwrapmask[1];
2488 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2489 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2490 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2491 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2492 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2493 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2494 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2495 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2496 out4f[x*4+0] = c[0];
2497 out4f[x*4+1] = c[1];
2498 out4f[x*4+2] = c[2];
2499 out4f[x*4+3] = c[3];
// single-texel sampling with clamped indices
2503 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2505 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2507 tci[0] = subtc[0]>>12;
2508 tci[1] = subtc[1]>>12;
2509 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2510 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2511 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2512 c[0] = pixel[0][2] * (1.0f / 255.0f);
2513 c[1] = pixel[0][1] * (1.0f / 255.0f);
2514 c[2] = pixel[0][0] * (1.0f / 255.0f);
2515 c[3] = pixel[0][3] * (1.0f / 255.0f);
2516 out4f[x*4+0] = c[0];
2517 out4f[x*4+1] = c[1];
2518 out4f[x*4+2] = c[2];
2519 out4f[x*4+3] = c[3];
// single-texel sampling with wrapped indices
2524 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2526 tci[0] = subtc[0]>>12;
2527 tci[1] = subtc[1]>>12;
2528 tci[0] &= tciwrapmask[0];
2529 tci[1] &= tciwrapmask[1];
2530 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2531 c[0] = pixel[0][2] * (1.0f / 255.0f);
2532 c[1] = pixel[0][1] * (1.0f / 255.0f);
2533 c[2] = pixel[0][0] * (1.0f / 255.0f);
2534 c[3] = pixel[0][3] * (1.0f / 255.0f);
2535 out4f[x*4+0] = c[0];
2536 out4f[x*4+1] = c[1];
2537 out4f[x*4+2] = c[2];
2538 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: writes one 32-bit texel result per pixel
// into out4ub (viewed as unsigned int through outi) for x in [startx, endx).
// Texture coordinates come from attribute array arrayindex (DPSOFTRAST_CALCATTRIB), are
// multiplied by zf[x] and by the mip level's width/height, are evaluated exactly at
// sub-span ends, and are stepped in between as 32-bit fixed point with 16 fractional bits.
// The per-pixel loops handle two pixels per iteration, followed by code for one pixel.
// Texel bytes are copied or blended in their stored order (no channel swizzle is visible).
2544 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2548 int startx = span->startx;
2549 int endx = span->endx;
2551 __m128 data, slope, tcscale;
2552 __m128i tcsize, tcmask, tcoffset, tcmax;
2554 __m128i subtc, substep, endsubtc;
2557 int affine; // LordHavoc: optimized affine texturing case
2558 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2559 const unsigned char * RESTRICT pixelbase;
2560 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2561 // if no texture is bound, just fill it with white
2564 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2567 mip = triangle->mip[texunitindex];
// entry [0] of the mip record is the byte offset of this mip level inside texture->bytes
2568 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2569 // if this mipmap of the texture is 1 pixel, just fill it with that color
2570 if (texture->mipmap[mip][1] == 4)
// the first (only) texel of this mip level as one 32-bit value
2572 unsigned int k = *((const unsigned int *)pixelbase);
2573 for (x = startx;x < endx;x++)
// same 1/w at both ends of the span: the "|| affine" test below then treats the whole
// span as a single sub-span
2577 affine = zf[startx] == zf[endx-1];
2578 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2579 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2580 flags = texture->flags;
// lanes = (mipmap[mip][2], mipmap[mip][3]) repeated twice, i.e. (width, height, width, height)
2581 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
// size - 1 per lane: upper clamp limit and wrap mask
2582 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2583 tcscale = _mm_cvtepi32_ps(tcsize);
// the low two lanes of data/slope are duplicated into the high lanes and scaled to texel units
2584 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2585 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// exact coordinates at the first pixel of the span
2586 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift of the coordinates
// NOTE(review): the condition guarding this line is not visible in this excerpt; presumably
// it only applies when linear filtering is enabled - confirm
2588 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2589 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// every 32-bit lane holds the 16-bit pair (4, width*4); _mm_madd_epi16 of an (x, y) index
// pair with it yields the byte offset x*4 + y*width*4
2590 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// 16-bit copies of size - 1, used as clamp limit / wrap mask on the packed index pairs
2591 tcmax = _mm_packs_epi32(tcmask, tcmask);
// one iteration per sub-span; x is advanced by the inner loops
2592 for (x = startx;x < endx;)
2594 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// converts a coordinate delta over a full sub-span into a per-pixel step with 16 fractional bits
2595 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2596 if (nextsub >= endx || affine)
2598 nextsub = endsub = endx-1;
2599 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2603 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2605 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2606 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2607 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinates of pixel x, high 64 bits: coordinates of pixel x+1
2608 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
// the loops advance two pixels at a time, so the step is doubled
2609 substep = _mm_slli_epi32(substep, 1);
// fast bilinear path: taken when the integer coordinates at both ends of the sub-span
// satisfy 0 <= coord < size-1, so no clamping or wrapping is applied and each pair of
// horizontally adjacent texels is fetched with a single 64-bit load
2612 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2613 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row (the high 16 bits of a tcoffset lane, width*4)
2615 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2616 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2618 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keeps the upper 16 bits (integer part) of each coordinate, giving one (x, y) pair per pixel
2619 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2620 tci = _mm_madd_epi16(tci, tcoffset);
2621 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2622 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// pix1/pix2: two adjacent texels from the row and the row below for the first pixel,
// pix3/pix4: the same for the second pixel, all widened to 16-bit lanes
2623 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2624 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2625 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2626 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// the 16-bit fractions shifted down to 15 bits, for use with the signed _mm_mulhi_epi16
2627 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows using the vertical fraction of each pixel
2628 pix1 = _mm_add_epi16(pix1,
2629 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2630 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2631 pix3 = _mm_add_epi16(pix3,
2632 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2633 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels (pix2) and right texels (pix4) of both pixels, then blend
// them using the horizontal fraction
2634 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2635 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2636 pix2 = _mm_add_epi16(pix2,
2637 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2638 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2639 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// same computation for a single pixel
2643 const unsigned char * RESTRICT ptr1;
2644 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2645 tci = _mm_madd_epi16(tci, tcoffset);
2646 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2647 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2648 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2649 fracm = _mm_srli_epi16(subtc, 1);
2650 pix1 = _mm_add_epi16(pix1,
2651 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2652 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2653 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2654 pix1 = _mm_add_epi16(pix1,
2655 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2657 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear path with clamp-to-edge: the four corner indices (x,y) (x+1,y) (x,y+1) (x+1,y+1)
// are built by adding the constant (0, 1, 0x10000, 0x10001) to four copies of the (x, y) pair,
// and each corner texel is loaded separately
2661 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2663 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2665 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// first pixel of the pair: corner indices clamped to [0, size-1]
2666 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2667 tci = _mm_madd_epi16(tci, tcoffset);
2668 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670 _mm_setzero_si128());
2671 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673 _mm_setzero_si128());
2674 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): for the second pixel of the pair the corner indices are ANDed with tcmax
// (wrap) instead of being clamped with min/max as for the first pixel and for the
// single-pixel code below, although this is the clamp-to-edge branch; this looks like a
// copy/paste slip - confirm and align with the first pixel if so
2675 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2676 tci = _mm_madd_epi16(tci, tcoffset);
2677 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2678 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2679 _mm_setzero_si128());
2680 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2681 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2682 _mm_setzero_si128());
2683 fracm = _mm_srli_epi16(subtc, 1);
2684 pix1 = _mm_add_epi16(pix1,
2685 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2686 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2687 pix3 = _mm_add_epi16(pix3,
2688 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2689 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2690 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2691 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2692 pix2 = _mm_add_epi16(pix2,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2694 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2695 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single pixel, clamped
2699 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2700 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2701 tci = _mm_madd_epi16(tci, tcoffset);
2702 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2703 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2704 _mm_setzero_si128());
2705 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2706 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2707 _mm_setzero_si128());
2708 fracm = _mm_srli_epi16(subtc, 1);
2709 pix1 = _mm_add_epi16(pix1,
2710 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2711 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2712 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2713 pix1 = _mm_add_epi16(pix1,
2714 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2715 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2716 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear path with wrapping: all corner indices are ANDed with size-1
2722 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2724 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2725 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2726 tci = _mm_madd_epi16(tci, tcoffset);
2727 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2728 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2729 _mm_setzero_si128());
2730 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2731 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2732 _mm_setzero_si128());
2733 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2734 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2735 tci = _mm_madd_epi16(tci, tcoffset);
2736 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2737 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2738 _mm_setzero_si128());
2739 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2740 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2741 _mm_setzero_si128());
2742 fracm = _mm_srli_epi16(subtc, 1);
2743 pix1 = _mm_add_epi16(pix1,
2744 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2745 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2746 pix3 = _mm_add_epi16(pix3,
2747 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2748 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2749 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2750 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2751 pix2 = _mm_add_epi16(pix2,
2752 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2753 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2754 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single pixel, wrapped
2758 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2759 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2760 tci = _mm_madd_epi16(tci, tcoffset);
2761 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2762 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2763 _mm_setzero_si128());
2764 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2765 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2766 _mm_setzero_si128());
2767 fracm = _mm_srli_epi16(subtc, 1);
2768 pix1 = _mm_add_epi16(pix1,
2769 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2770 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2771 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2772 pix1 = _mm_add_epi16(pix1,
2773 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2774 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2775 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling (no blending), indices clamped to [0, size-1]
2782 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2784 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2786 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2787 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2788 tci = _mm_madd_epi16(tci, tcoffset);
2789 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2790 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2794 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2795 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2796 tci = _mm_madd_epi16(tci, tcoffset);
2797 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling, indices wrapped by ANDing with size-1
2803 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2805 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2806 tci = _mm_and_si128(tci, tcmax);
2807 tci = _mm_madd_epi16(tci, tcoffset);
2808 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2809 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2813 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2814 tci = _mm_and_si128(tci, tcmax);
2815 tci = _mm_madd_epi16(tci, tcoffset);
2816 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2825 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2828 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to read-only float data and returns a float.
// NOTE(review): the body of this function is not visible in this excerpt; judging by the
// name it is meant to look up a shadow map value for the given vector - confirm what
// it actually returns before relying on it.
2831 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a span of 4-float pixels by an interpolated 4-component vertex attribute:
// out4f[x] = in4f[x] * ((data + slope*x) * z), per channel, for x in [startx, endx).
// data (base value) and slope (change per x step) are produced by DPSOFTRAST_CALCATTRIB4F
// for attribute array arrayindex.
// NOTE(review): the assignment of z is not visible in this excerpt; presumably z = zf[x],
// the per-pixel value passed in through zf - confirm.
2837 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2840 int startx = span->startx;
2841 int endx = span->endx;
2846 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2847 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2850 c[0] = (data[0] + slope[0]*x) * z;
2851 c[1] = (data[1] + slope[1]*x) * z;
2852 c[2] = (data[2] + slope[2]*x) * z;
2853 c[3] = (data[3] + slope[3]*x) * z;
// modulate the incoming pixel by it
2854 out4f[x*4+0] = in4f[x*4+0] * c[0];
2855 out4f[x*4+1] = in4f[x*4+1] * c[1];
2856 out4f[x*4+2] = in4f[x*4+2] * c[2];
2857 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated 4-component vertex attribute to a span of 4-float pixels:
// out4f[x] = (data + slope*x) * z, per channel, for x in [startx, endx).
// data (base value) and slope (change per x step) are produced by DPSOFTRAST_CALCATTRIB4F
// for attribute array arrayindex.
// NOTE(review): the assignment of z is not visible in this excerpt; presumably z = zf[x],
// the per-pixel value passed in through zf - confirm.
2861 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2864 int startx = span->startx;
2865 int endx = span->endx;
2870 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2871 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2874 c[0] = (data[0] + slope[0]*x) * z;
2875 c[1] = (data[1] + slope[1]*x) * z;
2876 c[2] = (data[2] + slope[2]*x) * z;
2877 c[3] = (data[3] + slope[3]*x) * z;
2878 out4f[x*4+0] = c[0];
2879 out4f[x*4+1] = c[1];
2880 out4f[x*4+2] = c[2];
2881 out4f[x*4+3] = c[3];
2885 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2887 int x, startx = span->startx, endx = span->endx;
2888 float c[4], localcolor[4];
2889 localcolor[0] = subcolor[0];
2890 localcolor[1] = subcolor[1];
2891 localcolor[2] = subcolor[2];
2892 localcolor[3] = subcolor[3];
2893 for (x = startx;x < endx;x++)
2895 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2896 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2897 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2898 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2899 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2900 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2901 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2902 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2906 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2908 int x, startx = span->startx, endx = span->endx;
2909 for (x = startx;x < endx;x++)
2911 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2912 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2913 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2914 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2918 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2920 int x, startx = span->startx, endx = span->endx;
2921 for (x = startx;x < endx;x++)
2923 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2924 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2925 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2926 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans: out = ina * a + inb * b for every channel, for x in [startx, endx).
// a is one minus the alpha (4th) channel of inb at the same pixel.
// NOTE(review): the declaration and assignment of b are not visible in this excerpt - presumably b = inb4f[x*4+3]; confirm.
2930 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2932 int x, startx = span->startx, endx = span->endx;
2934 for (x = startx;x < endx;x++)
// weight of the first buffer: inverse of the second buffer's alpha
2936 a = 1.0f - inb4f[x*4+3];
2938 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2939 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2940 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2941 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2945 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2947 int x, startx = span->startx, endx = span->endx;
2948 float localcolor[4], ilerp, lerp;
2949 localcolor[0] = color[0];
2950 localcolor[1] = color[1];
2951 localcolor[2] = color[2];
2952 localcolor[3] = color[3];
2953 ilerp = 1.0f - localcolor[3];
2954 lerp = localcolor[3];
2955 for (x = startx;x < endx;x++)
2957 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2958 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2959 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2960 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Modulates the BGRA8 pixels of in4ub by the interpolated attribute arrayindex and writes them to out4ub (SSE2).
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at sub-span boundaries; it is converted
// to integers scaled by 256 and stepped linearly (substep) for the pixels in between.
// NOTE(review): several lines are not visible in this excerpt (declarations of data/slope/mod/endmod, the
// assignments of mod and submod at the top of each sub-span, the guard and x++ of the single-pixel tail).
2966 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2970 int startx = span->startx;
2971 int endx = span->endx;
2974 __m128i submod, substep, endsubmod;
2975 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA -> BGRA, to match the pixel byte order)
2976 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2977 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, float and 8.8-style integer (x256)
2978 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2979 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// outer loop: one iteration per sub-span; x is advanced by the inner code
2980 for (x = startx; x < endx;)
2982 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2983 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
2984 if (nextsub >= endx)
2986 nextsub = endsub = endx-1;
2987 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel integer step toward it
2991 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2992 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2993 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = modulator for pixel x, high half = modulator for pixel x+1
2994 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2995 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single substep per two-pixel iteration - confirm whether the step should be doubled
2996 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes go into the high byte of each 16-bit lane, so mulhi yields (byte * modulator) / 256
2998 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2999 pix = _mm_mulhi_epu16(pix, submod);
// packus saturates each lane to 0..255
3000 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel tail of the sub-span
3004 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3005 pix = _mm_mulhi_epu16(pix, submod);
3006 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes the interpolated attribute arrayindex into out4ub as BGRA8 pixels (SSE2).
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at sub-span boundaries; it is converted
// to integers scaled by 4095 and stepped linearly (substep) in between, then shifted right by 4 to get bytes.
// NOTE(review): several lines are not visible in this excerpt (declarations of data/slope/mod/endmod, the
// assignments of mod and submod at the top of each sub-span, the guard and x++ of the single-pixel tail).
3013 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3017 int startx = span->startx;
3018 int endx = span->endx;
3021 __m128i submod, substep, endsubmod;
3022 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (presumably RGBA -> BGRA, to match the pixel byte order)
3023 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3024 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, float and integer (x4095)
3025 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3026 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// outer loop: one iteration per sub-span; x is advanced by the inner code
3027 for (x = startx; x < endx;)
3029 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3030 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
3031 if (nextsub >= endx)
3033 nextsub = endsub = endx-1;
3034 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel integer step toward it
3038 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3039 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3040 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
3041 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3042 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by a single substep per two-pixel iteration - confirm whether the step should be doubled
3043 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then packus saturates each lane to 0..255
3045 __m128i pix = _mm_srai_epi16(submod, 4);
3046 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel tail of the sub-span
3050 __m128i pix = _mm_srai_epi16(submod, 4);
3051 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Bloom combine on BGRA8 spans (SSE2): out = saturate(ina + max(inb - subcolor, 0)) per channel,
// for pixels in [startx, endx). subcolor is a float color that is scaled by 255 here.
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3058 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3061 int x, startx = span->startx, endx = span->endx;
// convert the subtract color to integers, swap lanes 0 and 2 (presumably RGBA -> BGRA), then
// pack to 16-bit so the same four values are repeated in both halves (one per pixel)
3062 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3063 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3064 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of both inputs to 16-bit lanes
3066 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps at zero; packus clamps the sum to 255
3068 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3069 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation for a single remaining pixel
3073 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3076 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Component-wise product of two BGRA8 spans (SSE2): out = (ina * inb) / 256 per channel, for pixels in [startx, endx).
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3081 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3084 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3085 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended; inb bytes are placed in the high byte of each lane (value * 256),
// so the high 16 bits of the product equal (ina * inb) / 256
3087 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3088 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3089 pix1 = _mm_mulhi_epu16(pix1, pix2);
3090 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation for a single remaining pixel
3094 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3095 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3096 pix1 = _mm_mulhi_epu16(pix1, pix2);
3097 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Component-wise saturating sum of two BGRA8 spans (SSE2): out = min(ina + inb, 255), for pixels in [startx, endx).
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3102 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3105 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3106 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16-bit lanes so the sum cannot wrap; packus clamps it to 255
3108 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3109 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3110 pix1 = _mm_add_epi16(pix1, pix2);
3111 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation for a single remaining pixel
3115 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3116 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3117 pix1 = _mm_add_epi16(pix1, pix2);
3118 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb onto ina for BGRA8 spans (SSE2): out = saturate(ina + inb * tint) per channel,
// for pixels in [startx, endx). The tint is read as four floats and scaled by 256; no lane reordering is
// applied to it here, so it is taken in the same channel order as the pixels.
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3123 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3126 int x, startx = span->startx, endx = span->endx;
// fixed-point tint (x256), duplicated into both halves so it applies to two pixels at once
3127 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3128 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3129 for (x = startx;x+2 <= endx;x+=2)
// ina zero-extended; inb in the high byte of each lane so mulhi gives (tint * inb) / 256
3131 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3132 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3133 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3134 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation for a single remaining pixel
3138 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3139 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3140 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3141 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb over ina for BGRA8 spans (SSE2), weighted by inb's own alpha:
// out = ina + (inb - ina) * alpha(inb) / 256 per channel, for pixels in [startx, endx).
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3146 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3149 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3150 for (x = startx;x+2 <= endx;x+=2)
3152 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3153 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (the 4th byte of each pixel, i.e. alpha) of each pixel across that pixel's four lanes
3154 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply yields (difference * alpha) / 256
3155 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3156 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation for a single remaining pixel (only the low half needs the alpha broadcast)
3160 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3161 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3162 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3163 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3164 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span toward a constant color (SSE2), weighted by the color's own alpha:
// out = in + (color - in) * alpha(color) / 256 per channel, for pixels in [startx, endx).
// color is read as four floats and scaled by 255.
// NOTE(review): the guard around the single-pixel tail is not visible in this excerpt.
3169 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3172 int x, startx = span->startx, endx = span->endx;
// integer color with lanes 0 and 2 swapped (presumably RGBA -> BGRA), packed to 16-bit and repeated for two pixels
3173 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3174 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: lane 3 (alpha) broadcast to every lane, pre-shifted left by 4 for the multiply below
3175 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3176 for (x = startx;x+2 <= endx;x+=2)
3178 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (difference << 4) * (alpha << 4), high 16 bits = (difference * alpha) / 256
3179 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3180 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same operation for a single remaining pixel
3184 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3185 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3186 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs the position array through DPSOFTRAST_Array_TransformProject
// with the ModelViewProjection matrix uniform, and loads the color and texcoord0 arrays unchanged.
// texcoord1 is loaded only when the SPECULAR permutation bit is set.
3193 void DPSOFTRAST_VertexShader_Generic(void)
3195 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3196 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3197 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3198 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3199 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3202 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3204 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3205 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3209 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3211 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3212 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3213 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3215 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3216 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3219 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3221 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3224 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3226 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3229 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3234 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3235 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs the position array through DPSOFTRAST_Array_TransformProject
// with the ModelViewProjection matrix uniform and loads texcoord0 unchanged.
// The TEXCOORD1 output array is filled from the TEXCOORD4 input array.
3240 void DPSOFTRAST_VertexShader_PostProcess(void)
3242 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3243 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3244 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader: samples the first texture into the fragment color, optionally adds
// bloom from the second texture (BLOOM permutation), mixes in the ViewTintColor uniform, and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but have no implementation here.
// NOTE(review): some lines are not visible in this excerpt (braces and possibly a condition around the
// view-tint mix) - confirm whether the tint is applied unconditionally.
3247 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3249 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3250 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3251 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3252 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3253 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// base image goes straight into the fragment color buffer
3254 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3255 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture uses attribute 3 and is thresholded by the BloomColorSubtract uniform before being added
3257 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3258 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3260 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3261 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3263 // TODO: implement saturation
3265 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3267 // TODO: implement gammaramps
3269 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed, through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix uniform.
3274 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3276 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: fills the span's fragment color with zero bytes and finishes it.
3279 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3281 // this is never called (because colormask is off when this shader is used)
3282 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel for the pixels in [startx, endx)
3285 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3286 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: positions go through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform; texcoord0 goes through DPSOFTRAST_Array_Transform with the TexMatrix uniform.
3291 void DPSOFTRAST_VertexShader_FlatColor(void)
3293 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3294 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader (SSE2): output = color texture * Color_Ambient, with the Alpha uniform
// as the alpha modulator.
// Pixels are written directly into the color framebuffer when the draw is opaque and has no alpha kill;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several lines are not visible in this excerpt (local declarations, the x advance/continue of
// the 4-pixel path, and presumably a pixelmask test before the single-pixel path) - confirm.
3297 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3300 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at the span's row and x origin, so pixel[x*4] addresses span pixel x
3301 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3302 int x, startx = span->startx, endx = span->endx;
3303 __m128i Color_Ambientm;
3304 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3305 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3306 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3307 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3308 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or any non-opaque blend mode needs the Finish step, so render into the temporary buffer instead
3309 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3310 pixel = buffer_FragColorbgra8;
// ambient color as integers (x256) with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3311 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the 4th lane with the Alpha uniform scaled by 255
3312 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3313 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// pack to 16-bit lanes, duplicated so one register modulates two pixels
3314 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3315 for (x = startx;x < endx;x++)
// fast path: four pixels at once when all four mask bytes are 1
3318 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3321 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each lane, so mulhi gives (modulator * byte) / 256
3322 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3323 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3324 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3330 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3331 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3332 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still has to be blended/written by the Finish step
3334 if (pixel == buffer_FragColorbgra8)
3335 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: positions go through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform, the color array is loaded unchanged, and texcoord0 goes through
// DPSOFTRAST_Array_Transform with the TexMatrix uniform.
3341 void DPSOFTRAST_VertexShader_VertexColor(void)
3343 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3345 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader (SSE2):
// output = color texture * (Color_Diffuse * interpolated vertex color + Color_Ambient), alpha from the Alpha uniform.
// Pixels are written directly into the color framebuffer when the draw is opaque and has no alpha kill;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several lines are not visible in this excerpt (declarations of data/slope/pix2/mod2, the
// x advance/continue of the 4-pixel path, and presumably a pixelmask test before the single-pixel path) - confirm.
3348 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3351 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at the span's row and x origin, so pixel[x*4] addresses span pixel x
3352 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3353 int x, startx = span->startx, endx = span->endx;
3354 __m128i Color_Ambientm, Color_Diffusem;
3356 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3357 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3360 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3361 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha kill or any non-opaque blend mode needs the Finish step, so render into the temporary buffer instead
3362 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363 pixel = buffer_FragColorbgra8;
// ambient color (x256), lanes 0 and 2 swapped, 4th lane replaced by the Alpha uniform (x255), packed to 16-bit
3364 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color (x4096), lanes 0 and 2 swapped, 4th lane cleared, packed to 16-bit
3368 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: reorder lanes like the uniforms, move to the first pixel, and pre-scale by 4096
3371 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3372 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3373 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3374 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3375 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3376 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is advanced by one slope per loop step so it always holds the attribute at the current x
3377 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3379 __m128i color, mod, pix;
// fast path: four pixels at once when all four mask bytes are 1
3380 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3383 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3384 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// attribute * z for each of the four pixels; data is stepped three times here, the loop increment adds the fourth
3385 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3386 data = _mm_add_ps(data, slope);
3387 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3388 data = _mm_add_ps(data, slope);
3389 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3390 data = _mm_add_ps(data, slope);
3391 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texture, using high-half multiplies on fixed-point values
3392 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3393 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3394 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3395 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3396 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path, same formula
3402 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3403 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3404 mod = _mm_packs_epi32(mod, mod);
3405 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3406 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still has to be blended/written by the Finish step
3408 if (pixel == buffer_FragColorbgra8)
3409 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: positions go through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform, texcoord0 goes through DPSOFTRAST_Array_Transform with the TexMatrix
// uniform, and texcoord4 (used by the pixel stage for the lightmap lookup) is loaded unchanged.
3415 void DPSOFTRAST_VertexShader_Lightmap(void)
3417 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3418 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3419 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader (SSE2):
// output = color texture * (Color_Diffuse * lightmap + Color_Ambient), alpha from the Alpha uniform;
// with the GLOW permutation, glow texture * Color_Glow is added on top.
// Pixels are written directly into the color framebuffer when the draw is opaque and has no alpha kill;
// otherwise they go to a temporary span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several lines are not visible in this excerpt (the else separating the glow and non-glow loops,
// pix2 declarations, the x advance/continue of the 4-pixel paths, and presumably pixelmask tests before the
// single-pixel paths) - confirm.
3422 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3425 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at the span's row and x origin, so pixel[x*4] addresses span pixel x
3426 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3427 int x, startx = span->startx, endx = span->endx;
3428 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3429 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3430 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3432 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord0, lightmap uses texcoord4
3435 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3436 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha kill or any non-opaque blend mode needs the Finish step, so render into the temporary buffer instead
3437 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3438 pixel = buffer_FragColorbgra8;
// ambient color (x256), lanes 0 and 2 swapped, 4th lane replaced by the Alpha uniform (x255), packed to 16-bit
3439 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3440 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3441 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3442 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color (x256), lanes 0 and 2 swapped, 4th lane cleared, packed to 16-bit
3443 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3444 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3445 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3446 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture with texcoord0 and prepare its color modulator the same way
3448 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3449 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3450 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3451 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color: lets the single-pixel path do both multiplies in one register
3452 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3453 for (x = startx;x < endx;x++)
3455 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when all four mask bytes are 1
3456 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3459 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3460 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3461 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, first for pixels 0-1 then for pixels 2-3
3462 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3463 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3464 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3465 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3466 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3467 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3468 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes the lit color term, the high half the glow term
// (the lightmap's high half is zero, so the high half reduces to glowcolor * glow)
3474 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3475 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3476 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3477 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the high half (glow term) onto the low half (lit color term)
3478 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3479 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3484 for (x = startx;x < endx;x++)
3486 __m128i color, lightmap, pix;
// fast path: four pixels at once when all four mask bytes are 1
3487 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3490 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3491 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * color
3492 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3493 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3494 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3495 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3496 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path, same formula
3502 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3503 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3504 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3505 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still has to be blended/written by the Finish step
3508 if (pixel == buffer_FragColorbgra8)
3509 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry points that
// follow call these two functions before their definitions appear further down.
3514 void DPSOFTRAST_VertexShader_LightDirection(void);
3515 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader entry point for the fake-light mode.
// Does no work of its own; it only runs the shared light-direction vertex shader.
3517 void DPSOFTRAST_VertexShader_FakeLight(void)
3519 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader entry point for the fake-light mode.
// Forwards all three arguments unchanged to the shared light-direction pixel shader,
// which selects its fake-light code path by testing thread->shader_mode.
3522 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3524 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space light-direction-map mode.
// Runs the shared light-direction vertex shader, then additionally passes TEXCOORD4
// through DPSOFTRAST_Array_Load; the shared pixel shader samples the lightmap and
// deluxemap textures with TEXCOORD4 in this mode.
3529 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3531 DPSOFTRAST_VertexShader_LightDirection();
3532 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space light-direction-map mode.
// Forwards all three arguments unchanged to the shared light-direction pixel shader,
// which selects the model-space code path by testing thread->shader_mode.
3535 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3537 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space light-direction-map mode.
// Runs the shared light-direction vertex shader, then additionally passes TEXCOORD4
// through DPSOFTRAST_Array_Load; the shared pixel shader samples the lightmap and
// deluxemap textures with TEXCOORD4 in this mode.
3542 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3544 DPSOFTRAST_VertexShader_LightDirection();
3545 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space light-direction-map mode.
// Forwards all three arguments unchanged to the shared light-direction pixel shader,
// which selects the tangent-space code path by testing thread->shader_mode.
3548 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3550 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex shader for the light-direction family of shader modes.
// For every vertex it expresses two vectors in the per-vertex svector/tvector/normal
// basis (read from TEXCOORD1/2/3) and writes them out as new varyings:
//   TEXCOORD5 = the LightDir uniform projected onto that basis
//   TEXCOORD6 = (EyePosition - position) projected onto that basis
// Both outputs get w = 0. Positions are projected last, after the loop has read them.
// NOTE(review): the declarations of i, LightDir, EyeVector, position, svector, tvector
// and normal are not visible in this chunk.
3555 void DPSOFTRAST_VertexShader_LightDirection(void)
3558 int numvertices = dpsoftrast.numvertices;
3560 float LightVector[4];
3561 float EyePosition[4];
3562 float EyeVectorModelSpace[4];
// Fetch the two uniforms once; only components 0..2 are used in the loop below.
3568 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3569 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3570 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3571 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3572 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3573 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3574 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3575 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays the loop reads from post_array4f: position and TEXCOORD1..3 go
// through Array_Load, TEXCOORD0 goes through Array_Transform with the TexMatrix uniform.
3576 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3577 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3578 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3579 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3580 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3581 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its three basis vectors (xyz only).
3583 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3584 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3585 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3586 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3587 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3588 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3589 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3590 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3591 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3592 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3593 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3594 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: one dot product of LightDir with each basis vector, stored in TEXCOORD5.
3595 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3596 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3597 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3598 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3599 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3600 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3601 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, projected onto the same basis, stored in TEXCOORD6.
3602 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3603 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3604 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3605 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3606 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3607 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3608 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3609 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3610 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3611 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Project positions with the ModelViewProjection matrix only after the loop above has
// consumed the unprojected values.
3613 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helpers used by the shaders in this file.
// Min and Max expand each argument more than once, so arguments with side effects
// (e.g. i++) must not be passed. Vector3Dot reads elements [0], [1] and [2] of both
// operands; Vector3Length goes through sqrt(), i.e. the double-precision routine.
3616 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3617 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3618 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3619 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3620 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line statement macro; the visible part computes the vector's length into a
// local named len. NOTE(review): the remainder of the body is not visible in this
// chunk - presumably it scales v by 1/len when len is non-zero; confirm.
3621 #define DPSOFTRAST_Vector3Normalize(v)\
3624 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader for the light-direction family of shader modes: the two
// LIGHTDIRECTIONMAP modes, FAKELIGHT, and a default mode handled by the final branch
// of each mode test.
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation, shader_mode
//              and shader_exactspecularmath.
//   triangle - triangle being drawn; passed to the span/texture/attribute helpers.
//   span     - span being drawn; pixels startx..endx-1 are shaded.
//
// One of three per-pixel loops runs, chosen by thread->shader_permutation:
//   SPECULAR  - ambient + diffuse + specular (+ glow), with optional pants/shirt layers
//   DIFFUSE   - ambient + diffuse (+ glow)
//   otherwise - ambient only (+ glow)
// Each loop writes clamped BGRA bytes into buffer_FragColorbgra8, which is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
//
// NOTE(review): several short lines (braces, else, declarations of z, d, diffuse,
// specular, glosstex, eyenormal, f) are not visible in this chunk; z is used below as
// a per-pixel factor and is presumably loaded from buffer_z[x] - confirm.
// NOTE(review): the pants/shirt layers are sampled whenever COLORMAPPING is set, but
// in the visible lines only the SPECULAR loop blends them into diffusetex - confirm
// whether the DIFFUSE and ambient-only loops are meant to ignore them.
3635 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3637 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3638 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3639 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3640 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3641 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3642 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3643 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3644 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3645 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3646 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3647 int x, startx = span->startx, endx = span->endx;
3648 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3649 float LightVectordata[4];
3650 float LightVectorslope[4];
3651 float EyeVectordata[4];
3652 float EyeVectorslope[4];
3653 float VectorSdata[4];
3654 float VectorSslope[4];
3655 float VectorTdata[4];
3656 float VectorTslope[4];
3657 float VectorRdata[4];
3658 float VectorRslope[4];
3660 float diffusetex[4];
3662 float surfacenormal[4];
3663 float lightnormal[4];
3664 float lightnormal_modelspace[4];
3666 float specularnormal[4];
3669 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped (uniform +0 -> [2],
// +2 -> [0]) so that index k lines up with byte k of the bgra8 texel buffers.
3671 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3672 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3673 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3674 Color_Glow[3] = 0.0f;
3675 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3676 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3677 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient colour's 4th slot carries the Alpha uniform; output alpha in every loop
// is diffusetex[3] * Color_Ambient[3].
3678 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3679 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3680 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3681 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3682 Color_Pants[3] = 0.0f;
3683 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3684 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3685 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3686 Color_Shirt[3] = 0.0f;
// Span setup and the texture fetches shared by all three loops. The colour texture is
// always sampled; pants/shirt and glow only when their permutation bits are set.
3687 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3688 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3689 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3691 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3694 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3696 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
3698 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3700 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3701 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3702 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3703 Color_Diffuse[3] = 0.0f;
// LightColor starts as the uniform; the lightmap and fake-light modes overwrite it
// per pixel inside the loop, the default mode keeps this value.
3704 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3705 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3706 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3707 LightColor[3] = 0.0f;
3708 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3710 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3711 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3712 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by the gloss texel's alpha byte.
3713 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every mode here for the specular term.
3714 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3715 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: model-space needs the S/T/R vectors plus lightmap and
// deluxemap, tangent-space needs only the two textures, fake-light needs nothing
// more, and the default mode needs the interpolated light vector (TEXCOORD5).
3717 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3719 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3720 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3721 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3722 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3723 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3725 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3727 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3728 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3730 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3732 // nothing of this needed
3736 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3739 for (x = startx;x < endx;x++)
// Diffuse texel as floats in 0..255, optionally with the tinted pants/shirt layers added.
3742 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3743 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3744 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3745 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3746 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3748 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3749 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3750 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3751 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3753 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3754 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3755 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3756 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte/128 - 1 per component, with bytes 2,1,0 mapped to
// x,y,z (undoing the BGRA byte order), then normalize.
3757 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3758 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3759 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3760 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Per-mode light direction (lightnormal) and light colour.
3762 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3764 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3765 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3766 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3767 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3769 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3770 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3771 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3772 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3774 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3775 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3776 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3777 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3779 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3780 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3781 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3782 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3784 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3785 DPSOFTRAST_Vector3Normalize(lightnormal);
3787 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by lightnormal.z, with the divisor
// floored at 0.25 so the result is boosted by at most 4x.
3789 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3790 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3791 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3792 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3795 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space: the deluxemap texel is used directly as the light direction (no
// normalize in the visible lines); light colour is the lightmap scaled by 1/256.
3797 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3801 float f = 1.0f / 256.0f;
3802 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3803 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3804 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3807 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: the light shines from the eye - use the normalized eye vector as the
// light direction, with a constant white light colour.
3809 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3810 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3811 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3812 DPSOFTRAST_Vector3Normalize(lightnormal);
3814 LightColor[0] = 1.0;
3815 LightColor[1] = 1.0;
3816 LightColor[2] = 1.0;
// Default mode: normalized interpolated light vector; LightColor keeps the uniform value.
3820 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3821 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3822 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3823 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
3826 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, two variants: reflection-vector form when shader_exactspecularmath
// is set, half-vector form otherwise.
3828 if(thread->shader_exactspecularmath)
3830 // reflect lightnormal at surfacenormal, take the negative of that
3831 // i.e. we want (2*dot(N, I) * N - I) for N=surfacenormal, I=lightnormal
3833 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3834 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3835 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3836 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3838 // dot of this and normalize(EyeVectorFogDepth.xyz)
3839 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3840 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3841 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3842 DPSOFTRAST_Vector3Normalize(eyenormal);
3844 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Half-vector variant: normalize(L + E) dotted with the surface normal.
3848 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3849 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3850 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3851 DPSOFTRAST_Vector3Normalize(eyenormal);
3853 specularnormal[0] = lightnormal[0] + eyenormal[0];
3854 specularnormal[1] = lightnormal[1] + eyenormal[1];
3855 specularnormal[2] = lightnormal[2] + eyenormal[2];
3856 DPSOFTRAST_Vector3Normalize(specularnormal);
3858 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is the SpecularPower uniform scaled by the gloss texel's alpha (byte/255).
3861 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine: [glow +] ambient + (diffuse + specular) * LightColor, truncated to int and
// clamped to 255 from above only. Alpha is diffuse alpha times the Alpha uniform.
3862 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3864 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3865 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3866 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3867 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3871 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3872 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3873 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3874 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3877 buffer_FragColorbgra8[x*4+0] = d[0];
3878 buffer_FragColorbgra8[x*4+1] = d[1];
3879 buffer_FragColorbgra8[x*4+2] = d[2];
3880 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
// Same light-direction logic as path 1, minus the gloss texture, specular colour and
// pants/shirt blending.
3883 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3885 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3886 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3887 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3888 Color_Diffuse[3] = 0.0f;
3889 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3890 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3891 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3892 LightColor[3] = 0.0f;
3893 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3895 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3897 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3898 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3899 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3900 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3901 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3903 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3905 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3906 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3908 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Without a specular term the eye vector is only needed when it doubles as the
// fake-light direction.
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3914 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3917 for (x = startx;x < endx;x++)
3920 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3921 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3922 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3923 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3924 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3925 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3926 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3927 DPSOFTRAST_Vector3Normalize(surfacenormal);
3929 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3931 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3932 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3933 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3934 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3936 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3937 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3938 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3939 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3941 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3942 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3943 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3944 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3946 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3947 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3948 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3949 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3951 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3952 DPSOFTRAST_Vector3Normalize(lightnormal);
3954 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3956 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3957 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3958 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3959 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3962 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3964 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3965 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3966 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3968 float f = 1.0f / 256.0f;
3969 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3970 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3971 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3974 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3976 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3977 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3978 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3979 DPSOFTRAST_Vector3Normalize(lightnormal);
3981 LightColor[0] = 1.0;
3982 LightColor[1] = 1.0;
3983 LightColor[2] = 1.0;
3987 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3988 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3989 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3990 DPSOFTRAST_Vector3Normalize(lightnormal);
// Combine: [glow +] diffusetex * (ambient + diffuse colour * N.L * LightColor),
// clamped to 255 from above.
3993 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3994 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3996 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3997 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3998 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3999 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4003 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4004 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4005 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4006 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4008 buffer_FragColorbgra8[x*4+0] = d[0];
4009 buffer_FragColorbgra8[x*4+1] = d[1];
4010 buffer_FragColorbgra8[x*4+2] = d[2];
4011 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ([glow +] diffusetex * Color_Ambient) ----
// NOTE(review): appears to be the fallback when neither SPECULAR nor DIFFUSE is set;
// the introducing else is not visible in this chunk.
4016 for (x = startx;x < endx;x++)
4019 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4020 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4021 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4022 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4026 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4027 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4028 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4029 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4033 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4034 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4035 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4036 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4038 buffer_FragColorbgra8[x*4+0] = d[0];
4039 buffer_FragColorbgra8[x*4+1] = d[1];
4040 buffer_FragColorbgra8[x*4+2] = d[2];
4041 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
4044 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source shader mode.
// For every vertex it projects the vertex-to-light offset (LightPosition - position)
// and the vertex-to-eye offset (EyePosition - position) onto the per-vertex
// svector/tvector/normal basis read from TEXCOORD1/2/3, and writes the results back
// into TEXCOORD1 (light vector) and TEXCOORD2 (eye vector) with w = 0.
// After the loop, positions are projected and TEXCOORD3 is produced by
// DPSOFTRAST_Array_Transform with the ModelToLight matrix.
// NOTE(review): the declarations of i, EyeVector, position, svector, tvector and
// normal are not visible in this chunk.
4049 void DPSOFTRAST_VertexShader_LightSource(void)
4052 int numvertices = dpsoftrast.numvertices;
4053 float LightPosition[4];
4054 float LightVector[4];
4055 float LightVectorModelSpace[4];
4056 float EyePosition[4];
4057 float EyeVectorModelSpace[4];
// Fetch the two uniforms once; only components 0..2 are used in the loop below.
4063 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4064 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4065 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4066 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4067 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4068 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4069 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4070 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays the loop reads from post_array4f. TEXCOORD4 is also loaded here
// although this function does not read it.
4071 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4072 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4073 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4074 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4075 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4076 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4077 for (i = 0;i < numvertices;i++)
// Copy this vertex's position and basis vectors into locals first: the TEXCOORD1 and
// TEXCOORD2 slots they come from are overwritten with the results further down.
4079 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4080 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4081 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4082 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4083 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4084 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4085 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4086 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4087 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4088 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4089 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4090 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: vertex-to-light offset projected onto the basis, stored in TEXCOORD1.
4091 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4092 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4093 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4094 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4095 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4096 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4097 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4098 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4099 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4100 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset projected onto the basis, stored in TEXCOORD2.
4101 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4102 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4103 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4104 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4105 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4106 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4107 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4108 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4109 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4110 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project positions with the ModelViewProjection matrix once the loop is done with the
// unprojected values.
4112 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// NOTE(review): presumably writes TEXCOORD3 as the POSITION input array transformed by
// the ModelToLight matrix, replacing the normals loaded earlier - confirm the argument
// order of DPSOFTRAST_Array_Transform.
4113 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for a single light source, run once per span.
//   thread   - supplies shader_permutation, shader_exactspecularmath and the uniform4f values read below
//   triangle - forwarded to DPSOFTRAST_CALCATTRIB4F and the span/texture helpers
//   span     - startx/endx bound the memset and every per-pixel loop
// Three lighting paths are chosen from thread->shader_permutation:
// SPECULAR (diffuse + specular), DIFFUSE (diffuse only), and a final ambient-only loop.
// Each path optionally multiplies in the cube filter texture (CUBEFILTER) and a
// shadow map sample (SHADOWMAP2D), then stores clamped BGRA bytes into
// buffer_FragColorbgra8, which is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): z, attenuation, diffuse, specular, f, d, glosstex and eyenormal are used
// below but their declarations/assignments are not in view here - confirm against the full file.
4116 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, indexed by pixel x (4 bytes per pixel for the bgra8 ones)
4119 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4120 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4121 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4122 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4123 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4124 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4125 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4126 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127 int x, startx = span->startx, endx = span->endx;
4128 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// data/slope pairs filled by DPSOFTRAST_CALCATTRIB4F; evaluated per pixel as (data + slope*x) * z
4129 float CubeVectordata[4];
4130 float CubeVectorslope[4];
4131 float LightVectordata[4];
4132 float LightVectorslope[4];
4133 float EyeVectordata[4];
4134 float EyeVectorslope[4];
4136 float diffusetex[4];
4138 float surfacenormal[4];
4139 float lightnormal[4];
4141 float specularnormal[4];
4144 float SpecularPower;
4145 float CubeVector[4];
// Load colour uniforms. The first three components are stored in reverse order
// (uniform +0 -> [2], +1 -> [1], +2 -> [0]); presumably this matches the BGRA byte
// order of the span buffers they are multiplied with - confirm.
// NOTE(review): Color_Glow is loaded but not read by any line visible in this function.
4148 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4149 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4150 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4151 Color_Glow[3] = 0.0f;
4152 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4153 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4154 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the 4th ambient slot carries the Alpha uniform rather than an ambient component
4155 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4156 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4157 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4158 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4159 Color_Diffuse[3] = 0.0f;
4160 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4161 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4162 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4163 Color_Specular[3] = 0.0f;
4164 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4165 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4166 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4167 Color_Pants[3] = 0.0f;
4168 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4169 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4170 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4171 Color_Shirt[3] = 0.0f;
4172 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4173 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4174 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4175 LightColor[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss texture's alpha byte (glosstex[3]) inside pow()
4176 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// interpolation setup: TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube vector
4177 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4178 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4179 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4180 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4181 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// texture fetches shared by all three lighting paths; pants/shirt and cube are fetched only when their permutation bit is set
4182 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4183 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4185 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4186 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4188 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4189 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: diffuse + specular (needs the normal and gloss textures) ----
4190 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4192 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4193 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4194 for (x = startx;x < endx;x++)
// attenuation is 1 - |CubeVector|^2 at this pixel
4197 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4198 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4199 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4200 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by each "attenuation < 0.01f" test is not in view;
// presumably the pixel is skipped and keeps the black written by the memset above - confirm.
4201 if (attenuation < 0.01f)
4203 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4205 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4206 if (attenuation < 0.01f)
// diffuse texel as floats in byte units; colormapping adds pants/shirt texels scaled by their colours
// (Color_Pants[3] and Color_Shirt[3] are 0, so the alpha line adds nothing)
4210 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4211 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4212 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4213 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4214 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4216 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4217 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4218 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4219 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4221 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4222 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4223 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4224 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 per channel, reading bytes +2,+1,+0 into x,y,z, then normalize
4225 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4226 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4227 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4228 DPSOFTRAST_Vector3Normalize(surfacenormal);
4230 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4231 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4232 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4233 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), clamped below at 0
4235 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant A: dot(eye direction, light direction reflected about the surface normal)
4237 if(thread->shader_exactspecularmath)
4239 // reflect lightnormal at surfacenormal, take the negative of that
4240 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4242 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4243 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4244 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4245 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4247 // dot of this and normalize(EyeVectorFogDepth.xyz)
4248 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4249 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4250 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4251 DPSOFTRAST_Vector3Normalize(eyenormal);
4253 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant B: dot(surfacenormal, normalized (lightnormal + eyenormal))
// NOTE(review): presumably the else branch of the shader_exactspecularmath test; the else line is not in view - confirm.
4257 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4258 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4259 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4260 DPSOFTRAST_Vector3Normalize(eyenormal);
4262 specularnormal[0] = lightnormal[0] + eyenormal[0];
4263 specularnormal[1] = lightnormal[1] + eyenormal[1];
4264 specularnormal[2] = lightnormal[2] + eyenormal[2];
4265 DPSOFTRAST_Vector3Normalize(specularnormal);
4267 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is SpecularPower (already /255) times the gloss alpha byte
4269 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: (diffusetex * (ambient + diffuse colour * diffuse) + glosstex * specular colour * specular) * light colour * attenuation,
// optionally times the cube filter texel; each channel is clamped at 255 only from above
4271 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4273 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4274 attenuation *= (1.0f / 255.0f);
4275 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4276 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4277 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4278 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same combination without the cube filter factor
4282 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4283 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4284 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4285 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4287 buffer_FragColorbgra8[x*4+0] = d[0];
4288 buffer_FragColorbgra8[x*4+1] = d[1];
4289 buffer_FragColorbgra8[x*4+2] = d[2];
4290 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse only (normal texture, no gloss/specular) ----
4293 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4295 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4296 for (x = startx;x < endx;x++)
// attenuation, shadow map and diffuse texel handling repeat path 1
4299 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4300 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4301 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4302 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4303 if (attenuation < 0.01f)
4305 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4307 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4308 if (attenuation < 0.01f)
4312 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4313 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4314 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4315 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4316 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4318 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4319 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4320 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4321 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4323 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4324 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4325 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4326 DPSOFTRAST_Vector3Normalize(surfacenormal);
4328 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4329 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4330 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4331 DPSOFTRAST_Vector3Normalize(lightnormal);
4333 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// combine: diffusetex * (ambient + diffuse colour * diffuse) * light colour * attenuation [* cube texel]
4334 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4336 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4337 attenuation *= (1.0f / 255.0f);
4338 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4339 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4340 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4341 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4345 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4346 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4347 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4348 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4350 buffer_FragColorbgra8[x*4+0] = d[0];
4351 buffer_FragColorbgra8[x*4+1] = d[1];
4352 buffer_FragColorbgra8[x*4+2] = d[2];
4353 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal texture; presumably the final else of the chain above - confirm) ----
4358 for (x = startx;x < endx;x++)
4361 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4362 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4363 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4364 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4365 if (attenuation < 0.01f)
4367 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4369 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4370 if (attenuation < 0.01f)
4374 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4375 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4376 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4377 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4378 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4380 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4381 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4382 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4383 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// combine: diffusetex * ambient * light colour * attenuation [* cube texel]
4385 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4387 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4388 attenuation *= (1.0f / 255.0f);
4389 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4390 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4391 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4392 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4396 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4397 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4398 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4399 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4401 buffer_FragColorbgra8[x*4+0] = d[0];
4402 buffer_FragColorbgra8[x*4+1] = d[1];
4403 buffer_FragColorbgra8[x*4+2] = d[2];
4404 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the common span finisher
4407 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction mode. Issues three array transforms, in this order:
//   TEXCOORD4 <- POSITION  with the ModelViewProjection matrix (DPSOFTRAST_Array_Transform);
//                DPSOFTRAST_PixelShader_Refraction reads TEXCOORD4 as ModelViewProjectionPosition
//   TEXCOORD0 <- TEXCOORD0 with the TexMatrix
//   POSITION  <- POSITION  with the ModelViewProjection matrix (DPSOFTRAST_Array_TransformProject)
// NOTE(review): assumes the first array argument is the destination and the second the source - confirm
// against DPSOFTRAST_Array_Transform / DPSOFTRAST_Array_TransformProject.
4413 void DPSOFTRAST_VertexShader_Refraction(void)
4415 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4416 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4417 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction mode, run once per span.
//   thread   - supplies the bound refraction texture and the uniform4f values read below
//   triangle - forwarded to DPSOFTRAST_CALCATTRIB4F and the span helpers
//   span     - startx/endx bound the per-pixel loop
// For each pixel: derives a screen-space texture coordinate from the interpolated
// ModelViewProjectionPosition (TEXCOORD4), offsets it by the normalized normal-map
// texel scaled by DistortScaleRefractReflect, samples the refraction texture there and
// multiplies the texel by RefractColor. Returns without drawing when no refraction texture is bound.
// NOTE(review): v, iw, c and z are used below but their declarations are not in view here - confirm.
4420 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4422 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4424 int x, startx = span->startx, endx = span->endx;
4427 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4428 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4431 float ModelViewProjectionPositiondata[4];
4432 float ModelViewProjectionPositionslope[4];
// only the first two components of each screen/distort uniform are loaded in this shader
4435 float ScreenScaleRefractReflect[2];
4436 float ScreenCenterRefractReflect[2];
4437 float DistortScaleRefractReflect[2];
4438 float RefractColor[4];
4440 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
// nothing to sample from: leave the span untouched
4441 if(!texture) return;
4444 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4445 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4448 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4451 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4452 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4453 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4454 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4455 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4456 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// colour components 0 and 2 are swapped on load; presumably to match the BGRA byte order of the sampled texel - confirm
4457 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4458 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4459 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4460 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4463 for (x = startx;x < endx;x++)
4465 float SafeScreenTexCoord[2];
4466 float ScreenTexCoord[2];
4473 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated w component; no guard against a zero denominator is visible here
4474 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4476 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4477 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4478 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted GLSL below says DistortScaleRefractReflect.zw, but this code uses components [0] and [1] - confirm which is intended.
4480 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map texel: byte/128 - 1 per channel, reading bytes +2,+1,+0 into v[0..2]
4481 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4482 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4483 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4484 DPSOFTRAST_Vector3Normalize(v);
4485 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4486 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4488 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4489 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the colour products are stored to unsigned char with no clamp; a RefractColor component
// above 1 (or below 0) would take the float out of the 0..255 range, where the conversion is undefined in C.
// Only the alpha channel is clamped (from above, at 255) - consider clamping the colour channels too.
4491 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4492 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4493 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4494 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4497 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water mode.
// For every vertex it computes the vector from the vertex position to the EyePosition uniform,
// expresses it in the per-vertex (svector, tvector, normal) basis read from TEXCOORD1/2/3,
// and stores the result in TEXCOORD6 with a 4th component of 0 (read by DPSOFTRAST_PixelShader_Water
// as EyeVector). It then transforms POSITION, TEXCOORD4 and TEXCOORD0.
// NOTE(review): i, position, svector, tvector, normal and EyeVector are used below but their
// declarations are not in view here - confirm.
4502 void DPSOFTRAST_VertexShader_Water(void)
4505 int numvertices = dpsoftrast.numvertices;
4506 float EyePosition[4];
4507 float EyeVectorModelSpace[4];
// NOTE(review): EyePosition[3] is loaded but only components 0..2 are read by the lines visible below
4513 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4514 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4515 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4516 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// load the arrays that the loop reads back from dpsoftrast.post_array4f
4517 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4518 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4519 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4520 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4521 for (i = 0;i < numvertices;i++)
4523 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4524 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4525 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4526 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4527 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4528 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4529 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4530 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4531 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4532 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4533 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4534 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye, in the same space as position
4535 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4536 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4537 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// three dot products: the eye vector's components along svector, tvector and normal
4538 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4539 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4540 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4541 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4542 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the meaning of the -1 source index is not visible here; presumably it reuses the
// POSITION data already loaded above - confirm against DPSOFTRAST_Array_TransformProject.
4546 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD4 is read by the water pixel shader as ModelViewProjectionPosition; TEXCOORD0 gets the texture matrix
4547 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4548 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the water mode, run once per span.
//   thread   - supplies the bound refraction/reflection textures and the uniform4f values read below
//   triangle - forwarded to DPSOFTRAST_CALCATTRIB4F and the span helpers
//   span     - startx/endx bound the per-pixel loop
// For each pixel: builds two distorted screen-space coordinates (components 0,1 for the refraction
// texture, 2,3 for the reflection texture), samples both textures, and blends
// refraction*RefractColor with reflection*ReflectColor by a Fresnel factor derived from the
// interpolated eye vector (TEXCOORD6). Returns without drawing when either texture is not bound.
// NOTE(review): v, iw, Fresnel and z are used below but their declarations are not in view here - confirm.
4552 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4554 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4556 int x, startx = span->startx, endx = span->endx;
4559 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4560 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4563 float ModelViewProjectionPositiondata[4];
4564 float ModelViewProjectionPositionslope[4];
4565 float EyeVectordata[4];
4566 float EyeVectorslope[4];
4569 float ScreenScaleRefractReflect[4];
4570 float ScreenCenterRefractReflect[4];
4571 float DistortScaleRefractReflect[4];
4572 float RefractColor[4];
4573 float ReflectColor[4];
4574 float ReflectFactor;
4575 float ReflectOffset;
4577 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4578 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
// both textures are required; leave the span untouched otherwise
4579 if(!texture_refraction || !texture_reflection) return;
4582 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4583 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4586 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4587 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// all four components of each screen/distort uniform are loaded: 0,1 feed the refraction lookup, 2,3 the reflection lookup
4590 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4591 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4592 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4593 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4594 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4595 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4596 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4597 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4598 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4599 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4600 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4601 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// colour components 0 and 2 are swapped on load; presumably to match the BGRA byte order of the sampled texels - confirm
4602 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4603 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4604 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4605 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4606 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4607 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4608 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4609 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4610 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4611 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4614 for (x = startx;x < endx;x++)
4616 float SafeScreenTexCoord[4];
4617 float ScreenTexCoord[4];
4620 unsigned char c1[4];
4621 unsigned char c2[4];
4626 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated w component; no guard against a zero denominator is visible here
4627 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4629 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4630 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4631 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4632 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4633 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4635 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// decode the normal map texel: byte/128 - 1 per channel, reading bytes +2,+1,+0 into v[0..2]
4636 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4637 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4638 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4639 DPSOFTRAST_Vector3Normalize(v);
// the same x/y distortion is applied to both coordinate pairs, each with its own scale
4640 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4641 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4642 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4643 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4645 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector; it is not multiplied by z because it is normalized immediately
4646 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4647 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4648 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4649 DPSOFTRAST_Vector3Normalize(v);
// Fresnel = min(1, 1 - v.z)^2 * ReflectFactor + ReflectOffset; the final value itself is not clamped
4650 Fresnel = 1.0f - v[2];
4651 Fresnel = min(1.0f, Fresnel);
4652 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4654 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4655 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4656 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4657 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// NOTE(review): the blended colour channels are stored to unsigned char with no clamp; if Fresnel leaves
// 0..1 or a colour scale exceeds 1 the float can fall outside 0..255, where the conversion is undefined in C.
// Only the alpha channel is clamped (from above, at 255) - consider clamping the colour channels too.
4659 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4660 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4661 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4662 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4665 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the ShowDepth shader mode.
// Passes DPSOFTRAST_ARRAY_POSITION as both array arguments of
// DPSOFTRAST_Array_TransformProject, together with a pointer to the
// ModelViewProjectionMatrixM1 entry of dpsoftrast.uniform4f (4 floats per uniform slot).
4670 void DPSOFTRAST_VertexShader_ShowDepth(void)
4672 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the ShowDepth shader mode.
// The visible code does not compute a colour: it zero-fills the BGRA8 output for
// the span's pixel range and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation - confirm against the full file.
4675 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span; the colour buffer holds 4 bytes per pixel
4678 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4679 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is only passed to Draw_Span_Begin here (presumably filled by it); nothing else in the visible code reads it
4680 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4 bytes of every pixel in [span->startx, span->endx)
4681 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4682 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredGeometry shader mode.
// Identical in the visible code to the ShowDepth vertex stage: the POSITION array is
// passed as both array arguments of DPSOFTRAST_Array_TransformProject along with the
// ModelViewProjectionMatrixM1 uniform.
4687 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4689 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the DeferredGeometry shader mode.
// The visible code writes all-zero BGRA8 for the span's pixel range and finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm against the full file.
4692 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span; the colour buffer holds 4 bytes per pixel
4695 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4696 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is only passed to Draw_Span_Begin here (presumably filled by it)
4697 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4 bytes of every pixel in [span->startx, span->endx)
4698 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4699 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the DeferredLightSource shader mode.
// Identical in the visible code to the ShowDepth vertex stage: the POSITION array is
// passed as both array arguments of DPSOFTRAST_Array_TransformProject along with the
// ModelViewProjectionMatrixM1 uniform.
4704 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4706 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage of the DeferredLightSource shader mode.
// The visible code writes all-zero BGRA8 for the span's pixel range and finishes the span.
// NOTE(review): looks like a placeholder implementation - confirm against the full file.
4709 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// scratch buffers sized for the longest possible span; the colour buffer holds 4 bytes per pixel
4712 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4713 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// buffer_z is only passed to Draw_Span_Begin here (presumably filled by it)
4714 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear the 4 bytes of every pixel in [span->startx, span->endx)
4715 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4716 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; DPSOFTRAST_ShaderModeTable holds one per mode.
// NOTE(review): the table initializers supply a leading integer and
// DPSOFTRAST_Interpret_Draw reads a `lodarrayindex` member; that member's
// declaration is among the lines not shown in this listing.
4721 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; DPSOFTRAST_DrawTriangles calls it with no arguments
4724 void (*Vertex)(void);
// span (pixel) stage; DPSOFTRAST_Draw_ProcessSpans calls it once per surviving span
4725 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex-array indices used by the mode; consumers compare each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 after the last used entry
4726 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture-unit indices used by the mode; consumers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 after the last used entry
4727 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4729 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (callers index it with thread->shader_mode or
// dpsoftrast.shader_mode). Each row gives: a leading integer (see `lodarrayindex` use
// in DPSOFTRAST_Interpret_Draw), the vertex stage, the span stage, the ~0-terminated
// list of vertex arrays, and the ~0-terminated list of texture units.
// NOTE(review): row order presumably has to match the SHADERMODE_* enumeration, which
// is declared outside this file section - verify before reordering.
4731 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4733 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4734 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4735 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4736 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4737 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4738 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4739 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4740 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4741 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4742 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4743 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4744 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4745 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
// NOTE(review): the three rows below give no texunits initializer, so C zero-fills
// that member (every entry is texture unit 0) instead of ~0-terminating it the way the
// Depth_Or_Shadow row does with "{~0}, {~0}". The mip setup loop in
// DPSOFTRAST_Interpret_Draw will therefore look at unit 0 for these modes - confirm
// whether that is intended.
4746 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4747 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4748 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span and stores it on the span.
// With depth testing enabled and a depth buffer present, each pixel's mask is the
// result of comparing the stored depth value with the span's interpolated depth
// according to thread->fb_depthfunc; otherwise every pixel in range is marked visible.
// NOTE(review): several declarations, the bodies of the two trimming loops, the
// `else` and the final endx store are not visible in this listing.
4751 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4756 unsigned int *depthpixel;
4760 unsigned char *pixelmask;
4761 DPSOFTRAST_State_Triangle *triangle;
4762 triangle = &thread->triangles[span->triangle];
// address of the depth-buffer entry at (span->x, span->y); x indices below are relative to it
// NOTE(review): computed before the fb_depthpixels check, but only dereferenced inside it
4763 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4764 startx = span->startx;
4766 depth = span->depthbase;
4767 depthslope = span->depthslope;
// the mask lives in a per-thread scratch array that the span merely points at
4768 pixelmask = thread->pixelmaskarray;
4769 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4771 switch(thread->fb_depthfunc)
// each case walks x over [startx, endx) while stepping d by depthslope;
// the stored depth is the left operand of every comparison
4774 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4775 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4776 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4777 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4778 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4779 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4780 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// skip over rejected pixels at the left and right ends of the span
// (loop bodies not shown; presumably startx++ / endx--)
4782 while (startx < endx && !pixelmask[startx])
4784 while (endx > startx && !pixelmask[endx-1])
4789 // no depth testing means we're just dealing with color...
4790 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the (possibly narrowed) start of the range back to the span
4792 span->pixelmask = pixelmask;
4793 span->startx = startx;
// Depth-buffer update pass for one span; does nothing unless depth writes
// (thread->depthmask) and depth testing are both enabled and a depth buffer exists.
// NOTE(review): the loop body is not visible in this listing; presumably it stores d
// into depthpixel[x] for pixels whose pixelmask entry is set - verify in the full file.
4797 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4799 int x, d, depth, depthslope, startx, endx;
4800 const unsigned char *pixelmask;
4801 unsigned int *depthpixel;
4802 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4804 depth = span->depthbase;
4805 depthslope = span->depthslope;
4806 pixelmask = span->pixelmask;
4807 startx = span->startx;
// address of the depth-buffer entry at (span->x, span->y); x below is relative to it
4809 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// walk the span, interpolating depth linearly with depthslope per pixel
4810 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans: each span is depth tested, shaded by the current
// shader mode's Span function when colour output is possible, then depth written.
// The span queue is emptied afterwards; the triangle queue is left for the caller to reset.
4816 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4819 DPSOFTRAST_State_Triangle *triangle;
4820 DPSOFTRAST_State_Span *span;
4821 for (i = 0; i < thread->numspans; i++)
4823 span = &thread->spans[i];
// each span records the index of the triangle setup it belongs to
4824 triangle = &thread->triangles[span->triangle];
4825 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have narrowed the span to nothing
// (the statement taken in that case is not shown; presumably `continue`)
4826 if (span->startx >= span->endx)
4828 // run pixel shader if appropriate
4829 // do this before running depthmask code, to allow the pixelshader
4830 // to clear pixelmask values for alpha testing
4831 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4832 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4833 DPSOFTRAST_Draw_DepthWrite(thread, span);
4835 thread->numspans = 0;
// Declares the Draw command (opcode number 22) through the DEFCOMMAND macro, which is
// defined outside this section. Payload fields: datasize; starty/endy (scanline range
// covered by the draw); refcount (atomic counter, initialised to the thread count and
// decremented by each interpreting thread); clipped; firstvertex/numvertices/numtriangles;
// arrays (vertex data block); element3i / element3s (32-bit or 16-bit index lists).
4838 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Per-thread interpreter for a Draw command. For every triangle it picks the three
// vertices, clips against the near plane (clipdist = z + w), projects to screen space,
// derives per-triangle gradients for 1/w and for every vertex array used by the current
// shader mode, chooses mip levels for the bound textures, and then walks the triangle
// in y, emitting spans into thread->spans. DPSOFTRAST_Draw_ProcessSpans is invoked
// whenever the span or triangle queue fills and once more at the end.
// On exit the command's refcount is decremented; when it reaches zero and the command
// carries no inline data (commandsize <= aligned header size) the separately allocated
// vertex block is released with MM_FREE.
// NOTE(review): this listing omits brace-only and short lines, so several conditions,
// `continue`/`return` statements and declarations are not visible here.
4840 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4843 int cullface = thread->cullface;
4844 int minx, maxx, miny, maxy;
4845 int miny1, maxy1, miny2, maxy2;
4846 __m128i fbmin, fbmax;
4847 __m128 viewportcenter, viewportscale;
4848 int firstvertex = command->firstvertex;
4849 int numvertices = command->numvertices;
4850 int numtriangles = command->numtriangles;
4851 const int *element3i = command->element3i;
4852 const unsigned short *element3s = command->element3s;
4853 int clipped = command->clipped;
4860 int starty, endy, bandy;
4864 float clip0origin, clip0slope;
4866 __m128 triangleedge1, triangleedge2, trianglenormal;
4869 DPSOFTRAST_State_Triangle *triangle;
4870 DPSOFTRAST_Texture *texture;
4871 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, then this thread's two scanline bands clamped into it
4872 miny = thread->fb_scissor[1];
4873 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4874 miny1 = bound(miny, thread->miny1, maxy);
4875 maxy1 = bound(miny, thread->maxy1, maxy);
4876 miny2 = bound(miny, thread->miny2, maxy);
4877 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw's scanline range misses both bands: nothing to rasterize on this thread,
// but this thread's reference must still be dropped
4878 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4880 if (!ATOMIC_DECREMENT(command->refcount))
4882 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4883 MM_FREE(command->arrays);
4887 minx = thread->fb_scissor[0];
4888 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp box as packed 16-bit (x, y) pairs: from (minx, miny1) to (maxx-1, maxy2-1)
4889 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4890 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4891 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4892 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4893 screen[3] = _mm_setzero_ps();
4894 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4895 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float4s; `arrays` starts at the
// block right after them and is advanced by one block per attribute further down
4897 const float *screencoord4f = command->arrays;
4898 const float *arrays = screencoord4f + numvertices*4;
4900 // generate the 3 edges of this triangle
4901 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices relative to firstvertex, taken from the 16-bit list ...
4904 e[0] = element3s[i*3+0] - firstvertex;
4905 e[1] = element3s[i*3+1] - firstvertex;
4906 e[2] = element3s[i*3+2] - firstvertex;
// ... or from the 32-bit list (the selecting conditions are not shown in this listing)
4910 e[0] = element3i[i*3+0] - firstvertex;
4911 e[1] = element3i[i*3+1] - firstvertex;
4912 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros follow: SKIPBACKFACE computes the screen-space edge vectors and the
// normal's z term used for culling; CLIPPEDVERTEXLERP/CLIPPEDVERTEXCOPY produce a
// clipped or unclipped screen vertex; GENATTRIB* do the same for attribute values.
4921 #define SKIPBACKFACE \
4922 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4923 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4924 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4925 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4926 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4930 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4934 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4939 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4940 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4942 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4943 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4945 #define CLIPPEDVERTEXCOPY(k,p1) \
4946 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4948 #define GENATTRIBCOPY(attrib, p1) \
4949 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4950 #define GENATTRIBLERP(attrib, p1, p2) \
4952 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4953 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4955 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4959 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4960 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4961 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4962 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4963 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4964 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4965 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4971 // calculate distance from nearplane
4972 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4973 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4974 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over which of the three vertices lie on the visible side (clipdist >= 0);
// cases that cut off exactly one vertex produce a fourth screen point (screen[3])
4975 if (clipdist[0] >= 0.0f)
4977 if (clipdist[1] >= 0.0f)
4979 if (clipdist[2] >= 0.0f)
4982 // triangle is entirely in front of nearplane
4983 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4990 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4998 if (clipdist[2] >= 0.0f)
5000 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5007 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5014 else if (clipdist[1] >= 0.0f)
5016 if (clipdist[2] >= 0.0f)
5018 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5025 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5031 else if (clipdist[2] >= 0.0f)
5033 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5038 else continue; // triangle is entirely behind nearplane
5041 // calculate integer y coords for triangle points
5042 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5043 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5044 screenmin = _mm_min_epi16(screeni, screenir),
5045 screenmax = _mm_max_epi16(screeni, screenir);
// horizontal min/max across the points, then clamp the bounding box to fbmin/fbmax
5046 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5047 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5048 screenmin = _mm_max_epi16(screenmin, fbmin);
5049 screenmax = _mm_min_epi16(screenmax, fbmax);
5050 // skip offscreen triangles
5051 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit lane 1 holds the y component of the clamped box; endy is exclusive
5053 starty = _mm_extract_epi16(screenmin, 1);
5054 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely between this thread's two bands
5055 if (starty >= maxy1 && endy <= miny2)
5057 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the per-thread triangle setup queue
5060 triangle = &thread->triangles[thread->numtriangles];
5062 // calculate attribute plans for triangle data...
5063 // okay, this triangle is going to produce spans, we'd better project
5064 // the interpolants now (this is what gives perspective texturing),
5065 // this consists of simply multiplying all arrays by the W coord
5066 // (which is basically 1/Z), which will be undone per-pixel
5067 // (multiplying by Z again) to get the perspective-correct array
5070 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5071 __m128 mipedgescale, mipdensity;
// edge vectors divided by the triangle normal term, broadcast into one register per component
5072 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5073 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5074 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5075 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5076 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// 4th component of each screen vertex, broadcast (the comment above calls this W, "basically 1/Z")
5077 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5078 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5079 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5080 attribedge1 = _mm_sub_ss(w0, w1);
5081 attribedge2 = _mm_sub_ss(w2, w1);
5082 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5083 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5084 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5085 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5086 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane equation for w: w(x, y) = w[2] + x*w[0] + y*w[1] (used that way when spans are emitted below)
5087 _mm_store_ss(&triangle->w[0], attribxslope);
5088 _mm_store_ss(&triangle->w[1], attribyslope);
5089 _mm_store_ss(&triangle->w[2], attriborigin);
// clip plane active: build the same kind of plane for screen z (component 2) and
// combine it with fb_clipplane to get a per-scanline clipping boundary in x
5094 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5096 float cliporigin, clipxslope, clipyslope;
5097 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5098 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5099 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5100 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5101 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5102 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5103 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5104 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5105 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// the guard for this first case is not shown; it divides by clipxslope, so it
// presumably requires clipxslope != 0
5108 clip0origin = -cliporigin/clipxslope;
5109 clip0slope = -clipyslope/clipxslope;
5110 clip0dir = clipxslope > 0 ? 1 : -1;
5112 else if(clipyslope > 0)
5114 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5115 clip0slope = dpsoftrast.fb_width;
5118 else if(clipyslope < 0)
5120 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5121 clip0slope = -dpsoftrast.fb_width;
5124 else if(clip0origin < 0) continue;
// build a gradient plane (x slope, y slope, origin) for every vertex array the shader mode lists
5127 mipedgescale = _mm_setzero_ps();
5128 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5130 __m128 attrib0, attrib1, attrib2;
5131 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// list terminator reached (the statement taken is not shown; presumably `break`)
5132 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-attribute block of numvertices float4s
5134 arrays += numvertices*4;
5135 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are pre-multiplied by w so they can be interpolated linearly in screen space
5136 attriborigin = _mm_mul_ps(attrib1, w1);
5137 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5138 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5139 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5140 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5141 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5142 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5143 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5144 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array designated by lodarrayindex drives mip selection: attribute delta along
// each edge scaled by the reciprocal screen-space edge length (approximate, via rsqrt)
5145 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5147 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5148 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5149 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5150 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit used by the shader mode; default is level 0
5154 memset(triangle->mip, 0, sizeof(triangle->mip));
5155 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5157 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// list terminator reached (the statement taken is not shown; presumably `break`)
5158 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5160 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
5161 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5163 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5164 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5165 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5166 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5167 // this will be multiplied in the texturing routine by the texture resolution
5168 y = _mm_cvtss_si32(mipdensity);
// log(y)*0.5/ln(2) == log2(sqrt(y)); the result is capped at the last mip level
// (a guard on y before the log is presumably among the lines not shown)
5171 y = (int)(log((float)y)*0.5f/M_LN2);
5172 if (y > texture->mipmaps - 1)
5173 y = texture->mipmaps - 1;
5174 triangle->mip[texunit] = y;
// scanline walk: first limited to band 1 (bandy = min(endy, maxy1)); the loop step
// then re-arms for band 2 (bandy = min(endy, maxy2)) and skips y ahead to at least miny2
5180 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5183 __m128 xcoords, xslope;
// per-point mask of "y is greater than the point's integer y", used to pick the two active edges
5184 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5185 int yccmask = _mm_movemask_epi8(ycc);
5186 int edge0p, edge0n, edge1p, edge1n;
// edge selection table for the four-point case (judging by the 4-bit pattern comments)
5195 case 0xFFFF: /*0000*/ y = endy; continue;
5196 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5197 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5198 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5199 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5200 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5201 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5202 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5203 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5204 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5205 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5206 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5207 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5208 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5209 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5210 case 0x0000: /*1111*/ y++; continue;
// edge selection table for the three-point case (3-bit pattern comments)
5218 case 0xFFFF: /*000*/ y = endy; continue;
5219 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5220 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5221 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5222 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5223 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5224 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5225 case 0x0000: /*111*/ y++; continue;
// nexty: last scanline on which the same edge pair stays valid, limited to the current band
5228 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5229 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5230 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5231 nexty = _mm_extract_epi16(ycc, 0);
5232 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges, then their x positions at scanline y, biased by 0.5
5233 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5234 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5235 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5236 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5237 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half so startx <= endx below
5238 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5240 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5241 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// clip-plane boundary in x for this scanline, stepped by clip0slope per line
5243 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5244 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5246 int startx, endx, offset;
5247 startx = _mm_cvtss_si32(xcoords);
5248 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp to the horizontal scissor range and drop empty scanlines
5249 if (startx < minx) startx = minx;
5250 if (endx > maxx) endx = maxx;
5251 if (startx >= endx) continue;
// trim against the clip-plane boundary (the enclosing direction tests are not shown in this listing)
5259 if(endx <= clip0) continue;
5260 startx = (int)clip0;
5263 else if (endx > clip0)
5265 if(startx >= clip0) continue;
// emit the scanline in pieces of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5270 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5272 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5273 span->triangle = thread->numtriangles;
5277 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5278 if (span->startx >= span->endx)
// fixed-point depth for the span from the w plane, with polygon offset applied to the base
5280 wslope = triangle->w[0];
5281 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5282 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5283 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: shade what has been queued so far
5284 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5285 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that reference it, then restart the queue
5290 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5292 DPSOFTRAST_Draw_ProcessSpans(thread);
5293 thread->numtriangles = 0;
// drop this thread's reference; the thread that takes it to zero frees out-of-line vertex data
5297 if (!ATOMIC_DECREMENT(command->refcount))
5299 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5300 MM_FREE(command->arrays);
// flush whatever is still queued
5303 if (thread->numspans > 0 || thread->numtriangles > 0)
5305 DPSOFTRAST_Draw_ProcessSpans(thread);
5306 thread->numtriangles = 0;
5311 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5315 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5316 int datasize = 2*numvertices*sizeof(float[4]);
5317 DPSOFTRAST_Command_Draw *command;
5318 unsigned char *data;
5319 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5321 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5322 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5324 datasize += numvertices*sizeof(float[4]);
5327 datasize += numtriangles*sizeof(unsigned short[3]);
5329 datasize += numtriangles*sizeof(int[3]);
5330 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5331 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5333 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5334 data = (unsigned char *)MM_CALLOC(datasize, 1);
5338 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5339 data = (unsigned char *)command + commandsize;
5341 command->firstvertex = firstvertex;
5342 command->numvertices = numvertices;
5343 command->numtriangles = numtriangles;
5344 command->arrays = (float *)data;
5345 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5346 dpsoftrast.firstvertex = firstvertex;
5347 dpsoftrast.numvertices = numvertices;
5348 dpsoftrast.screencoord4f = (float *)data;
5349 data += numvertices*sizeof(float[4]);
5350 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5351 data += numvertices*sizeof(float[4]);
5352 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5354 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5355 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5357 dpsoftrast.post_array4f[j] = (float *)data;
5358 data += numvertices*sizeof(float[4]);
5360 command->element3i = NULL;
5361 command->element3s = NULL;
5364 command->element3s = (unsigned short *)data;
5365 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5369 command->element3i = (int *)data;
5370 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point that queues one indexed triangle draw.
// Allocates the Draw command, runs the current shader mode's vertex stage, clamps the
// resulting scanline range to the framebuffer height and discards the command if that
// range is empty. Otherwise the command is published with one reference per thread and
// either handed to the worker threads or processed via DPSOFTRAST_Draw_FlushThreads.
// NOTE(review): the early `return`, the `else` and a local declaration are not visible
// in this listing.
5375 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5377 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// vertex stage for the active shader mode; it takes no arguments and works on global state
5378 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// dpsoftrast.drawstarty/drawendy are presumably produced by the vertex stage - TODO confirm
5379 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5380 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing covers any scanline: release out-of-line data and take the command back
5381 if (command->starty >= command->endy)
5383 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5384 MM_FREE(command->arrays);
5385 DPSOFTRAST_UndoCommand(command->commandsize);
5388 command->clipped = dpsoftrast.drawclipped;
// every thread decrements this once in DPSOFTRAST_Interpret_Draw
5389 command->refcount = dpsoftrast.numthreads;
5391 if (dpsoftrast.usethreads)
5394 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose scanline bands overlap the draw's range
5395 for (i = 0; i < dpsoftrast.numthreads; i++)
5397 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5398 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5399 Thread_CondSignal(thread->drawcond);
5404 DPSOFTRAST_Draw_FlushThreads();
// Declares the SetRenderTargets command (opcode number 23) carrying the new width and
// height, via the DEFCOMMAND macro defined outside this section.
5408 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: the visible code only marks the thread's framebuffer-dependent
// state as needing revalidation; the command's width/height are not read here.
5409 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5411 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Binds a new set of render targets.
//
// width, height   - dimensions stored as the framebuffer size
// depthpixels     - stored as the depth target
// colorpixels0..3 - stored as color targets 0..3
//
// The new values are written into the global dpsoftrast state, the viewport
// center/scale are recalculated from the current viewport, and a
// SetRenderTargets command carrying width and height is allocated.
// NOTE(review): this listing omits short lines; the statement guarded by the
// change-detection condition below is not visible here - presumably pending
// work is flushed before the targets are swapped; confirm against the full file.
5413 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5415 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any requested target or dimension differs from what is currently bound
5416 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5417 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5418 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
// record the new targets in the global state
5420 dpsoftrast.fb_width = width;
5421 dpsoftrast.fb_height = height;
5422 dpsoftrast.fb_depthpixels = depthpixels;
5423 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5424 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5425 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5426 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport transform is recomputed after the framebuffer size has been updated
5427 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// allocate the command and fill in its payload
5428 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5429 command->width = width;
5430 command->height = height;
// Executes queued commands on behalf of one thread.
//
// thread    - thread state whose commandoffset marks where to resume
// endoffset - command pool offset at which to stop
//
// Starting at thread->commandoffset, each command in dpsoftrast.commandpool is
// dispatched on its opcode until the running offset equals endoffset; the final
// offset is then stored back into thread->commandoffset.
// NOTE(review): this listing omits short lines (braces, break statements and the
// like), so some per-case statements are not visible here.
5433 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5435 int commandoffset = thread->commandoffset;
5436 while (commandoffset != endoffset)
5438 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5439 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for DPSOFTRAST_OPCODE_<name>: it calls
// DPSOFTRAST_Interpret_<name>, advances by the aligned size of
// DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5441 #define INTERPCOMMAND(name) \
5442 case DPSOFTRAST_OPCODE_##name : \
5443 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5444 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5445 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5446 commandoffset = 0; \
5448 INTERPCOMMAND(Viewport)
5449 INTERPCOMMAND(ClearColor)
5450 INTERPCOMMAND(ClearDepth)
5451 INTERPCOMMAND(ColorMask)
5452 INTERPCOMMAND(DepthTest)
5453 INTERPCOMMAND(ScissorTest)
5454 INTERPCOMMAND(Scissor)
5455 INTERPCOMMAND(BlendFunc)
5456 INTERPCOMMAND(BlendSubtract)
5457 INTERPCOMMAND(DepthMask)
5458 INTERPCOMMAND(DepthFunc)
5459 INTERPCOMMAND(DepthRange)
5460 INTERPCOMMAND(PolygonOffset)
5461 INTERPCOMMAND(CullFace)
5462 INTERPCOMMAND(SetTexture)
5463 INTERPCOMMAND(SetShader)
5464 INTERPCOMMAND(Uniform4f)
5465 INTERPCOMMAND(UniformMatrix4f)
5466 INTERPCOMMAND(Uniform1i)
5467 INTERPCOMMAND(SetRenderTargets)
5468 INTERPCOMMAND(ClipPlane)
// Draw commands advance by the size recorded in the command itself
// (command->commandsize) rather than a fixed struct size, and the thread's
// offset is stored immediately after each draw instead of only at the end.
// NOTE(review): the statement under the pool-size check is not visible in this
// listing - presumably the same wrap to 0 as in INTERPCOMMAND; confirm.
5470 case DPSOFTRAST_OPCODE_Draw:
5471 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5472 commandoffset += command->commandsize;
5473 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5475 thread->commandoffset = commandoffset;
// NOTE(review): the handling of the Reset opcode is not visible in this listing.
5478 case DPSOFTRAST_OPCODE_Reset:
// store where interpretation stopped
5483 thread->commandoffset = commandoffset;
5486 static int DPSOFTRAST_Draw_Thread(void *data)
5488 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5489 while(thread->index >= 0)
5491 if (thread->commandoffset != dpsoftrast.drawcommand)
5493 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5497 Thread_LockMutex(thread->drawmutex);
5498 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5500 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5501 thread->starving = true;
5502 Thread_CondWait(thread->drawcond, thread->drawmutex);
5503 thread->starving = false;
5505 Thread_UnlockMutex(thread->drawmutex);
5511 static void DPSOFTRAST_Draw_FlushThreads(void)
5513 DPSOFTRAST_State_Thread *thread;
5515 DPSOFTRAST_Draw_SyncCommands();
5516 if (dpsoftrast.usethreads)
5518 for (i = 0; i < dpsoftrast.numthreads; i++)
5520 thread = &dpsoftrast.threads[i];
5521 if (thread->commandoffset != dpsoftrast.drawcommand)
5523 Thread_LockMutex(thread->drawmutex);
5524 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5525 Thread_CondSignal(thread->drawcond);
5526 Thread_UnlockMutex(thread->drawmutex);
5529 for (i = 0; i < dpsoftrast.numthreads; i++)
5531 thread = &dpsoftrast.threads[i];
5532 if (thread->commandoffset != dpsoftrast.drawcommand)
5534 Thread_LockMutex(thread->drawmutex);
5535 if (thread->commandoffset != dpsoftrast.drawcommand)
5537 thread->waiting = true;
5538 Thread_CondWait(thread->waitcond, thread->drawmutex);
5539 thread->waiting = false;
5541 Thread_UnlockMutex(thread->drawmutex);
5547 for (i = 0; i < dpsoftrast.numthreads; i++)
5549 thread = &dpsoftrast.threads[i];
5550 if (thread->commandoffset != dpsoftrast.drawcommand)
5551 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5554 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point; forwards to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
// Public entry point taking no arguments and returning nothing.
// NOTE(review): the body of this function is not visible in this listing -
// presumably it completes all pending rendering before returning; confirm
// against the full file.
5562 void DPSOFTRAST_Finish(void)
5567 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5577 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5578 dpsoftrast.bigendian = u.b[3];
5579 dpsoftrast.fb_width = width;
5580 dpsoftrast.fb_height = height;
5581 dpsoftrast.fb_depthpixels = depthpixels;
5582 dpsoftrast.fb_colorpixels[0] = colorpixels;
5583 dpsoftrast.fb_colorpixels[1] = NULL;
5584 dpsoftrast.fb_colorpixels[1] = NULL;
5585 dpsoftrast.fb_colorpixels[1] = NULL;
5586 dpsoftrast.viewport[0] = 0;
5587 dpsoftrast.viewport[1] = 0;
5588 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5589 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5590 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5591 dpsoftrast.texture_firstfree = 1;
5592 dpsoftrast.texture_end = 1;
5593 dpsoftrast.texture_max = 0;
5594 dpsoftrast.color[0] = 1;
5595 dpsoftrast.color[1] = 1;
5596 dpsoftrast.color[2] = 1;
5597 dpsoftrast.color[3] = 1;
5598 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5599 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5600 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5601 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5602 for (i = 0; i < dpsoftrast.numthreads; i++)
5604 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5606 thread->cullface = GL_BACK;
5607 thread->colormask[0] = 1;
5608 thread->colormask[1] = 1;
5609 thread->colormask[2] = 1;
5610 thread->colormask[3] = 1;
5611 thread->blendfunc[0] = GL_ONE;
5612 thread->blendfunc[1] = GL_ZERO;
5613 thread->depthmask = true;
5614 thread->depthtest = true;
5615 thread->depthfunc = GL_LEQUAL;
5616 thread->scissortest = false;
5617 thread->viewport[0] = 0;
5618 thread->viewport[1] = 0;
5619 thread->viewport[2] = dpsoftrast.fb_width;
5620 thread->viewport[3] = dpsoftrast.fb_height;
5621 thread->scissor[0] = 0;
5622 thread->scissor[1] = 0;
5623 thread->scissor[2] = dpsoftrast.fb_width;
5624 thread->scissor[3] = dpsoftrast.fb_height;
5625 thread->depthrange[0] = 0;
5626 thread->depthrange[1] = 1;
5627 thread->polygonoffset[0] = 0;
5628 thread->polygonoffset[1] = 0;
5629 thread->clipplane[0] = 0;
5630 thread->clipplane[1] = 0;
5631 thread->clipplane[2] = 0;
5632 thread->clipplane[3] = 1;
5634 thread->numspans = 0;
5635 thread->numtriangles = 0;
5636 thread->commandoffset = 0;
5637 thread->waiting = false;
5638 thread->starving = false;
5640 thread->validate = -1;
5641 DPSOFTRAST_Validate(thread, -1);
5643 if (dpsoftrast.usethreads)
5645 thread->waitcond = Thread_CreateCond();
5646 thread->drawcond = Thread_CreateCond();
5647 thread->drawmutex = Thread_CreateMutex();
5648 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5654 void DPSOFTRAST_Shutdown(void)
5657 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5659 DPSOFTRAST_State_Thread *thread;
5660 for (i = 0; i < dpsoftrast.numthreads; i++)
5662 thread = &dpsoftrast.threads[i];
5663 Thread_LockMutex(thread->drawmutex);
5665 Thread_CondSignal(thread->drawcond);
5666 Thread_UnlockMutex(thread->drawmutex);
5667 Thread_WaitThread(thread->thread, 0);
5668 Thread_DestroyCond(thread->waitcond);
5669 Thread_DestroyCond(thread->drawcond);
5670 Thread_DestroyMutex(thread->drawmutex);
5673 for (i = 0;i < dpsoftrast.texture_end;i++)
5674 if (dpsoftrast.texture[i].bytes)
5675 MM_FREE(dpsoftrast.texture[i].bytes);
5676 if (dpsoftrast.texture)
5677 free(dpsoftrast.texture);
5678 if (dpsoftrast.threads)
5679 MM_FREE(dpsoftrast.threads);
5680 memset(&dpsoftrast, 0, sizeof(dpsoftrast));