3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform abstraction for alignment attributes, memory barriers and atomic
// counters. Each compiler/OS branch defines ALIGN, ATOMIC, MEMORY_BARRIER,
// ATOMIC_COUNTER and ATOMIC_INCREMENT/ATOMIC_DECREMENT/ATOMIC_ADD.
// NOTE(review): every branch maps MEMORY_BARRIER to _mm_sfence(), which orders
// stores only; the commented-out __sync_synchronize() would be a full barrier.
// Confirm that store ordering alone is enough for the users of this macro.
// Apple: OSAtomic*32Barrier functions from <libkern/OSAtomic.h>.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC targeting Windows: GCC attribute syntax with the Win32 Interlocked* API.
30 #elif defined(__GNUC__) && defined(WIN32)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile LONG
36 // this LONG * cast serves to fix an issue with broken mingw
37 // packages on Ubuntu; these only declare the function to take
38 // a LONG *, causing a compile error here. This seems to be
39 // error- and warn-free on platforms that DO declare
40 // InterlockedIncrement correctly, like mingw on Windows.
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
// Any other GCC-compatible compiler: __sync builtins on a plain int.
44 #elif defined(__GNUC__)
45 #define ALIGN(var) var __attribute__((__aligned__(16)))
46 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47 #define MEMORY_BARRIER (_mm_sfence())
48 //(__sync_synchronize())
49 #define ATOMIC_COUNTER volatile int
50 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment with the Win32 Interlocked* API.
53 #elif defined(_MSC_VER)
54 #define ALIGN(var) __declspec(align(16)) var
55 #define ATOMIC(var) __declspec(align(4)) var
56 #define MEMORY_BARRIER (_mm_sfence())
58 #define ATOMIC_COUNTER volatile LONG
59 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, a no-op barrier and plain
// (non-atomic) counter arithmetic.
66 #define ALIGN(var) var
69 #define ATOMIC(var) var
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
88 #include <emmintrin.h>
// Compatibility shim: GCC releases older than 4.6 do not provide
// _mm_cvtss_f32, so emulate it with the vector-extract builtin.
// The version test must use __GNUC__ (the misspelled __GNUC evaluates to 0 in
// #if and made the test always true) and must tie the minor-version check to
// major version 4, otherwise e.g. GCC 5.4 or 13.2 would also get the shim.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
96 static void *MM_CALLOC(size_t nmemb, size_t size)
98 void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Release function matching the _mm_malloc based allocators.
103 #define MM_FREE _mm_free
// Plain C library allocators under the same MM_* names.
// NOTE(review): presumably the alternate branch of a conditional whose
// directives are not visible here - confirm against the full file.
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Small positive float constant (1e-18).
110 #define DPSOFTRAST_FLT_MIN 0.000000000000000001f
// Vertex attribute array slots: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of slots and sizes the per-attribute
// arrays (triangle attribs[], post_array4f[]).
112 typedef enum DPSOFTRAST_ARRAY_e
114 DPSOFTRAST_ARRAY_POSITION,
115 DPSOFTRAST_ARRAY_COLOR,
116 DPSOFTRAST_ARRAY_TEXCOORD0,
117 DPSOFTRAST_ARRAY_TEXCOORD1,
118 DPSOFTRAST_ARRAY_TEXCOORD2,
119 DPSOFTRAST_ARRAY_TEXCOORD3,
120 DPSOFTRAST_ARRAY_TEXCOORD4,
121 DPSOFTRAST_ARRAY_TEXCOORD5,
122 DPSOFTRAST_ARRAY_TEXCOORD6,
123 DPSOFTRAST_ARRAY_TEXCOORD7,
124 DPSOFTRAST_ARRAY_TOTAL
// Per-texture record stored in the dpsoftrast.texture[] array.
// A slot whose `bytes` pointer is NULL is treated as free by the lookup and
// allocation code.
128 typedef struct DPSOFTRAST_Texture_s
// filter mode set through DPSOFTRAST_Texture_Filter
135 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; NOTE(review): presumably the number of outstanding binds by
// draw threads - confirm where it is incremented
138 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated zeroed in DPSOFTRAST_Texture_New
139 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width,
// [3] height, [4] depth
140 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are aligned like every other ALIGN()ed object; COMMAND_SIZE is the
// granularity that command sizes are rounded up to (see DPSOFTRAST_ALIGNCOMMAND).
144 #define COMMAND_SIZE ALIGN_SIZE
145 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command in the command pool: the opcode
// selects the command type and commandsize is its size in bytes.
147 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
149 unsigned char opcode;
150 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not
// fit in the space left before the end of the pool.
154 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the struct type DPSOFTRAST_Command_<name>,
// which starts with the common header followed by `fields`.
156 #define DEFCOMMAND(opcodeval, name, fields) \
157 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
158 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
160 unsigned char opcode; \
161 unsigned short commandsize; \
163 } DPSOFTRAST_Command_##name );
// Size in bytes of the command ring buffer (2 MiB) and a size limit for a
// single command (16 KiB).
165 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
166 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command ring buffer. The allocation code also uses the members freecommand
// (next write offset) and usedcommands (bytes currently in use), which are
// declared on lines not shown here.
168 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
172 ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
174 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed by the span shaders.
176 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
178 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] = per-pixel x slope, [1] = per-pixel y slope,
// [2] = base value (this is how DPSOFTRAST_CALCATTRIB combines them)
180 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
182 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span: loads the x slope of attribute `arrayindex`
// into `slope` and computes the attribute value at (span->x, span->y) into
// `data` as base + x*xslope + y*yslope. Uses aligned loads (_mm_load_ps), so
// attribs must be 16-byte aligned. `triangle` and `span` are evaluated more
// than once.
184 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
185 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
186 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
187 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
188 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB for float[4] outputs.
//   triangle   - pointer to an object with attribs[array][3][4]
//   span       - pointer to an object with integer x and y members
//   data       - float[4] output: base + x*xslope + y*yslope per component
//   slope      - float[4] output: the x slope of the attribute
//   arrayindex - attribute slot to evaluate
// Every pointer/array argument is parenthesized so that expressions such as
// `spans + i` can be passed (the previous `(span->x)` form did not compile
// for those). Arguments are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): presumably the pixel count of one sub-span processed at a
// time by the span shaders - not used in the visible code, confirm.
201 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels of a triangle on a single scanline.
203 typedef ALIGN(struct DPSOFTRAST_State_Span_s
205 int triangle; // triangle this span was generated by
206 int x; // framebuffer x coord
207 int y; // framebuffer y coord
208 int startx; // usable range (according to pixelmask)
209 int endx; // usable range (according to pixelmask)
210 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
211 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
212 int depthslope; // depthbuffer value pixel delta
214 DPSOFTRAST_State_Span);
// Capacities of the per-thread span and triangle arrays, and the size basis
// of the per-thread pixel mask buffer.
216 #define DPSOFTRAST_DRAW_MAXSPANS 1024
217 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
218 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate; each bit marks one group of derived
// state that DPSOFTRAST_Validate must recompute before it is used.
220 #define DPSOFTRAST_VALIDATE_FB 1
221 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
222 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all derived state groups
223 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Internal blend modes. DPSOFTRAST_RecalcBlendFunc maps the GL-style
// (sfactor, dfactor) pair and the subtract flag onto one of these; any
// unrecognized combination maps to DPSOFTRAST_BLENDMODE_OPAQUE.
225 typedef enum DPSOFTRAST_BLENDMODE_e
227 DPSOFTRAST_BLENDMODE_OPAQUE,
228 DPSOFTRAST_BLENDMODE_ALPHA,
229 DPSOFTRAST_BLENDMODE_ADDALPHA,
230 DPSOFTRAST_BLENDMODE_ADD,
231 DPSOFTRAST_BLENDMODE_INVMOD,
232 DPSOFTRAST_BLENDMODE_MUL,
233 DPSOFTRAST_BLENDMODE_MUL2,
234 DPSOFTRAST_BLENDMODE_SUBALPHA,
235 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
236 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes (not a mode itself)
237 DPSOFTRAST_BLENDMODE_TOTAL
239 DPSOFTRAST_BLENDMODE;
// Per-draw-thread state. Each thread keeps its own copy of the render state,
// updated by the DPSOFTRAST_Interpret_* functions as commands are consumed.
// Only part of the member list is visible in this view.
241 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
257 float polygonoffset[2];
// clip plane in framebuffer space, derived by DPSOFTRAST_RecalcClipPlane
259 ALIGN(float fb_clipplane[4]);
262 int shader_permutation;
263 int shader_exactspecularmath;
// textures bound per unit; pointers into dpsoftrast.texture[] (rebased by
// DPSOFTRAST_Texture_Grow when that array moves)
265 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
267 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
268 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
270 // DPSOFTRAST_VALIDATE_ flags
273 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport: index 0 is the w term,
// 1 = x, 2 = y, 3 = z
276 ALIGN(float fb_viewportcenter[4]);
277 ALIGN(float fb_viewportscale[4]);
279 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
282 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// byte offset in the command pool up to which this thread has consumed
// commands (compared against dpsoftrast.drawcommand by the producer)
291 ATOMIC(volatile int commandoffset);
// waiting: set by the producer while it blocks on this thread's waitcond
293 volatile bool waiting;
// starving: NOTE(review): presumably set by the thread while it sleeps for
// more commands; the producer signals drawcond when it is set - confirm
294 volatile bool starving;
301 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
302 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
303 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
305 DPSOFTRAST_State_Thread);
// Global rasterizer state owned by the API (command producing) side.
// Only part of the member list is visible in this view.
307 typedef ALIGN(struct DPSOFTRAST_State_s
// render targets: one depth buffer and up to four color buffers, addressed
// as pixels[y * fb_width + x] by the clear commands
311 unsigned int *fb_depthpixels;
312 unsigned int *fb_colorpixels[4];
// producer-side copy of the viewport transform (see DPSOFTRAST_Viewport)
315 ALIGN(float fb_viewportcenter[4]);
316 ALIGN(float fb_viewportscale[4]);
319 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
320 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// client vertex array pointers
322 const float *pointer_vertex3f;
323 const float *pointer_color4f;
324 const unsigned char *pointer_color4ub;
325 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
328 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
329 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
330 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
334 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
335 float *screencoord4f;
341 int shader_permutation;
342 int shader_exactspecularmath;
// texture table: texture_firstfree is where the search for a free slot starts
346 int texture_firstfree;
347 DPSOFTRAST_Texture *texture;
// last error message set by an API call (string literal, not owned)
352 const char *errorstring;
// array of numthreads draw thread states
357 DPSOFTRAST_State_Thread *threads;
// command pool offset up to which commands have been published to the threads
359 ATOMIC(volatile int drawcommand);
361 DPSOFTRAST_State_Command_Pool commandpool;
// the single global rasterizer instance
365 DPSOFTRAST_State dpsoftrast;
// Scale from a [0,1] float depth to the integer depth buffer format
// (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally in [0,1]) into one 32-bit BGRA8 pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7, each rounded
// to the nearest integer. Arguments are parenthesized so expressions can be
// passed, and the shifts are done on unsigned values because shifting a
// signed int into the sign bit (alpha >= 128) is undefined behaviour.
// The result keeps its original int type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth format; note the inversion:
// d = 1 maps to 0 and d = 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
372 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
375 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
377 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
378 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
379 fb_viewportcenter[3] = 0.5f;
380 fb_viewportcenter[0] = 0.0f;
381 fb_viewportscale[1] = 0.5f * viewport[2];
382 fb_viewportscale[2] = -0.5f * viewport[3];
383 fb_viewportscale[3] = 0.5f;
384 fb_viewportscale[0] = 1.0f;
387 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
389 if (dpsoftrast.interlace)
391 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
394 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
398 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
399 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
403 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
405 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
406 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
407 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
408 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
409 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes all framebuffer-derived state of a thread: the scissor rectangle
// in framebuffer coordinates (fb_scissor = {x, y, width, height}), the
// viewport transform, the framebuffer-space clip plane and the thread's
// scanline bands.
412 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
414 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
415 // and viewport projection values
// convert the GL-style scissor (y from the bottom) to top-down rows
418 x1 = thread->scissor[0];
419 x2 = thread->scissor[0] + thread->scissor[2];
420 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
421 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
422 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer
// NOTE(review): the matching clamps of x1/y1 to 0 are not visible in this
// view - confirm they exist, otherwise fb_scissor could start off-screen
424 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
426 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
427 thread->fb_scissor[0] = x1;
428 thread->fb_scissor[1] = y1;
429 thread->fb_scissor[2] = x2 - x1;
430 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so it is derived after it
432 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
433 DPSOFTRAST_RecalcClipPlane(thread);
434 DPSOFTRAST_RecalcThread(thread);
437 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
439 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
442 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
444 if (thread->blendsubtract)
446 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
448 #define BLENDFUNC(sfactor, dfactor, blendmode) \
449 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
450 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
451 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
456 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
458 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
459 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
460 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
461 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
462 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
463 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
464 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
465 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
466 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
467 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
468 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap inline pre-check for DPSOFTRAST_Validate: only calls it when at least
// one of the dirty bits in `f` is set in thread->validate. Evaluates to 0.
// `thread` is parenthesized so that pointer expressions can be passed; it is
// evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
475 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
477 mask &= thread->validate;
480 if (mask & DPSOFTRAST_VALIDATE_FB)
482 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
483 DPSOFTRAST_RecalcFB(thread);
485 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
487 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
488 DPSOFTRAST_RecalcDepthFunc(thread);
490 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
492 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
493 DPSOFTRAST_RecalcBlendFunc(thread);
497 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
499 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
500 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture[] array and rebases every bound-texture
// pointer (global state and all threads) onto the reallocated array.
// NOTE(review): the realloc result is assigned straight to dpsoftrast.texture
// and never checked; on failure the old array is leaked and the rebasing
// below works on a NULL base. Also, the rebasing subtracts `oldtexture` after
// realloc may already have freed it - computing the indices before the
// realloc would avoid relying on a stale pointer value.
504 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound pointers can be converted to indices afterwards
506 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
507 DPSOFTRAST_State_Thread *thread;
511 // expand texture array as needed
// capacity is at least 1024 entries
// NOTE(review): the doubling below is presumably the else branch of this
// check (the line between them is not visible) - confirm
512 if (dpsoftrast.texture_max < 1024)
513 dpsoftrast.texture_max = 1024;
515 dpsoftrast.texture_max *= 2;
516 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the producer-side bindings
517 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
518 if (dpsoftrast.texbound[i])
519 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the per-thread bindings
520 for (j = 0; j < dpsoftrast.numthreads; j++)
522 thread = &dpsoftrast.threads[j];
523 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
524 if (thread->texbound[i])
525 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and returns its handle.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a
//            DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions in texels; each must be a power of two
//            and at most DPSOFTRAST_TEXTURE_MAXSIZE
// Invalid arguments are reported through dpsoftrast.errorstring.
// NOTE(review): the return statements are not visible in this view; the
// error paths presumably return 0 and the success path the slot index -
// confirm.
529 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
538 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
539 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
540 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is computed before the MAXSIZE check below, so
// very large arguments could overflow int here - confirm callers' ranges
541 if (width*height*depth < 1)
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
546 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
553 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
554 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
555 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
557 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
558 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
560 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as above; the condition guarding this one is
// not visible here - confirm the message matches its branch
565 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
568 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
570 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
575 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
577 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
580 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
582 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): exact duplicate of the previous check
585 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
587 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
590 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
592 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
595 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
597 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
600 // find first empty slot in texture array
601 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
602 if (!dpsoftrast.texture[texnum].bytes)
604 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array when the chosen slot lies beyond its capacity
605 if (dpsoftrast.texture_max <= texnum)
606 DPSOFTRAST_Texture_Grow();
607 if (dpsoftrast.texture_end <= texnum)
608 dpsoftrast.texture_end = texnum + 1;
609 texture = &dpsoftrast.texture[texnum];
610 memset(texture, 0, sizeof(*texture));
611 texture->flags = flags;
612 texture->width = width;
613 texture->height = height;
614 texture->depth = depth;
615 texture->sides = sides;
// per mip level: 4 bytes per texel, all sides stored together
627 s = w * h * d * sides * 4;
// record {offset, byte size, width, height, depth} of this level
628 texture->mipmap[mipmaps][0] = size;
629 texture->mipmap[mipmaps][1] = s;
630 texture->mipmap[mipmaps][2] = w;
631 texture->mipmap[mipmaps][3] = h;
632 texture->mipmap[mipmaps][4] = d;
// the chain ends at 1x1x1, or after the first level without the mipmap flag
635 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
641 texture->mipmaps = mipmaps;
642 texture->size = size;
644 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible in this view
645 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Destroys the texture with the given handle; invalid handles are ignored.
// Frees the pixel storage, clears the slot and updates the bookkeeping used
// to find free slots.
649 void DPSOFTRAST_Texture_Free(int index)
651 DPSOFTRAST_Texture *texture;
652 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
656 MM_FREE(texture->bytes);
657 texture->bytes = NULL;
// zeroing the record also leaves bytes == NULL, which marks the slot free
658 memset(texture, 0, sizeof(*texture));
659 // adjust the free range and used range
660 if (dpsoftrast.texture_firstfree > index)
661 dpsoftrast.texture_firstfree = index;
// shrink the used range past any free slots at its end
662 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
663 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture from level 0 by averaging
// texels of the parent level (4 bytes per texel, each byte channel averaged
// with rounding). Invalid handles are ignored.
665 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
667 int i, x, y, z, w, layer0, layer1, row0, row1;
668 unsigned char *o, *i0, *i1, *i2, *i3;
669 DPSOFTRAST_Texture *texture;
670 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// a single level means there is nothing to generate
671 if (texture->mipmaps <= 1)
673 for (i = 1;i < texture->mipmaps;i++)
675 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer when the parent has only one layer
679 if (layer1 >= texture->mipmap[i-1][4])
680 layer1 = texture->mipmap[i-1][4]-1;
681 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row when the parent has only one row
685 if (row1 >= texture->mipmap[i-1][3])
686 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four parent rows
// (layer0/layer1 x row0/row1) in level i-1
687 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
688 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
689 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
690 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
691 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
692 w = texture->mipmap[i][2];
695 if (texture->mipmap[i-1][2] > 1)
697 // average 3D texture
// eight parent texels per output texel
698 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
700 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
701 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
702 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
703 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
708 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read but not advanced by this loop; harmless
// only if w is 1 here - confirm
709 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
711 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
712 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
713 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
714 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
720 if (texture->mipmap[i-1][2] > 1)
722 // average 2D texture (common case)
// four parent texels per output texel
723 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
725 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
726 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
727 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
728 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
733 // 2D texture with parent width == 1
734 o[0] = (i0[0] + i1[0] + 1) >> 1;
735 o[1] = (i0[1] + i1[1] + 1) >> 1;
736 o[2] = (i0[2] + i1[2] + 1) >> 1;
737 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangle of tightly packed 4-byte texels from `pixels` into the
// texture and regenerates its mipmaps. Invalid handles are ignored.
//   blockx, blocky           - top-left texel of the destination rectangle
//   blockwidth, blockheight  - rectangle size in texels
// NOTE(review): the visible code always addresses level 0 (mipmap[0]) and no
// use of `mip` or bounds check of the rectangle is visible here - confirm
// against the hidden lines.
744 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
746 DPSOFTRAST_Texture *texture;
748 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; rows are mipmap[0][2] texels wide
753 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
// copy one source row per iteration
754 while (blockheight > 0)
756 memcpy(dst, pixels, blockwidth * 4);
757 pixels += blockwidth * 4;
758 dst += texture->mipmap[0][2] * 4;
762 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole level 0 image of a texture with `pixels` and regenerates
// its mipmaps. `pixels` must provide mipmap[0][1] bytes (the byte size of
// level 0). Invalid handles are ignored.
764 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
766 DPSOFTRAST_Texture *texture;
767 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
771 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
772 DPSOFTRAST_Texture_CalculateMipmaps(index);
774 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
776 DPSOFTRAST_Texture *texture;
777 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
778 return texture->mipmap[mip][2];
780 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
782 DPSOFTRAST_Texture *texture;
783 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
784 return texture->mipmap[mip][3];
786 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
788 DPSOFTRAST_Texture *texture;
789 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
790 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 (null) for an invalid handle.
// NOTE(review): `mip` is used as an index into mipmap[] without a visible
// range check - confirm callers only pass valid levels.
792 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
794 DPSOFTRAST_Texture *texture;
795 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
798 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Invalid handles are ignored.
// Filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected with an
// error string when the texture was created without the mipmap flag.
800 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
802 DPSOFTRAST_Texture *texture;
803 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
804 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
806 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
811 texture->filter = filter;
814 static void DPSOFTRAST_Draw_FlushThreads(void);
816 static void DPSOFTRAST_Draw_SyncCommands(void)
818 if(dpsoftrast.usethreads) MEMORY_BARRIER;
819 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes sure at least `space` bytes of the command pool are available for
// new commands, waiting for draw threads to consume old commands if needed.
// The pool is a ring buffer; usedcommands is the distance from the write
// offset back to the least advanced thread.
// NOTE(review): loop structure and early returns are on lines not visible in
// this view; the comments below describe only the visible statements.
822 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
824 DPSOFTRAST_State_Thread *thread;
826 int freecommand = dpsoftrast.commandpool.freecommand;
827 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already according to the cached count
828 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish pending commands so the threads can make progress
830 DPSOFTRAST_Draw_SyncCommands();
// recompute the bytes in use from each thread's read offset
836 for (i = 0; i < dpsoftrast.numthreads; i++)
838 thread = &dpsoftrast.threads[i];
// distance from this thread's read offset to the write offset, modulo pool size
839 commandoffset = freecommand - thread->commandoffset;
840 if (commandoffset < 0)
841 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// track the thread that is furthest behind
842 if (commandoffset > usedcommands)
845 usedcommands = commandoffset;
// done when there is room now, or when no thread is holding space
848 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// otherwise block until the slowest thread has caught up
850 thread = &dpsoftrast.threads[waitindex];
851 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex: only wait if the thread is still behind
852 if (thread->commandoffset != dpsoftrast.drawcommand)
854 thread->waiting = true;
// wake the thread first if it is sleeping for work
855 if (thread->starving) Thread_CondSignal(thread->drawcond);
856 Thread_CondWait(thread->waitcond, thread->drawmutex);
857 thread->waiting = false;
859 Thread_UnlockMutex(thread->drawmutex);
861 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE (which must be a
// power of two for the mask arithmetic to work). `size` is evaluated twice.
864 #define DPSOFTRAST_ALIGNCOMMAND(size) \
865 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> in the command pool,
// with its opcode and (aligned) size already filled in, and yields a typed
// pointer to it.
866 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
867 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command ring buffer for a command with the
// given opcode and fills in the command header. If the command does not fit
// before the end of the pool, a Reset command is written at the current
// offset and the remaining tail is counted as used.
// NOTE(review): the statements that advance/wrap freecommand and the return
// of the command pointer are on lines not visible in this view.
869 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
871 DPSOFTRAST_Command *command;
872 int freecommand = dpsoftrast.commandpool.freecommand;
873 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one more header (needed for a later Reset command)
874 int extra = sizeof(DPSOFTRAST_Command);
// the tail of the pool is wasted when the command does not fit there
875 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
876 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: make room first
877 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
// NOTE(review): the flush below is presumably the else branch (single
// threaded case) of this check - confirm
879 if (dpsoftrast.usethreads)
880 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
882 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool state
883 freecommand = dpsoftrast.commandpool.freecommand;
884 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: emit a Reset marker
886 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
888 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
889 command->opcode = DPSOFTRAST_OPCODE_Reset;
890 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
893 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
894 command->opcode = opcode;
895 command->commandsize = size;
897 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
899 dpsoftrast.commandpool.freecommand = freecommand;
900 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated command of `size` bytes by moving
// the pool's write offset and used count back.
// NOTE(review): the statements that subtract `size` from freecommand and test
// for wrap-around are on lines not visible in this view; the visible
// `+= DPSOFTRAST_DRAW_MAXCOMMANDPOOL` is presumably the wrap correction.
904 static void DPSOFTRAST_UndoCommand(int size)
906 int freecommand = dpsoftrast.commandpool.freecommand;
907 int usedcommands = dpsoftrast.commandpool.usedcommands;
910 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
911 usedcommands -= size;
912 dpsoftrast.commandpool.freecommand = freecommand;
913 dpsoftrast.commandpool.usedcommands = usedcommands;
916 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
917 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
919 thread->viewport[0] = command->x;
920 thread->viewport[1] = command->y;
921 thread->viewport[2] = command->width;
922 thread->viewport[3] = command->height;
923 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Viewport command for the draw threads and updates the
// producer's own copy of the viewport and its derived transform.
// NOTE(review): the assignments of command->x / command->y are on lines not
// visible in this view.
925 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
927 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
930 command->width = width;
931 command->height = height;
// keep the producer-side state in sync immediately
933 dpsoftrast.viewport[0] = x;
934 dpsoftrast.viewport[1] = y;
935 dpsoftrast.viewport[2] = width;
936 dpsoftrast.viewport[3] = height;
937 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: clear color as four float channels.
940 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills the part of every attached color buffer that lies inside
// both the scissor rectangle and this thread's scanline bands with the
// packed BGRA8 clear color.
941 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
943 int i, x1, y1, x2, y2, w, h, x, y;
944 int miny1, maxy1, miny2, maxy2;
// scissor and bands must be current before they are read
948 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949 miny1 = thread->miny1;
950 maxy1 = thread->maxy1;
951 miny2 = thread->miny2;
952 maxy2 = thread->maxy2;
953 x1 = thread->fb_scissor[0];
954 y1 = thread->fb_scissor[1];
955 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
956 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to the rows this thread owns
957 if (y1 < miny1) y1 = miny1;
958 if (y2 > maxy2) y2 = maxy2;
963 // FIXME: honor fb_colormask?
964 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; unattached ones are null
965 for (i = 0;i < 4;i++)
967 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to maxy1 first, then from miny2 up to maxy2
969 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
972 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
973 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command with the given clear color.
// NOTE(review): the assignments of the command fields are on lines not
// visible in this view.
978 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
980 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: clear depth as a float.
987 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills the part of the depth buffer that lies inside both the
// scissor rectangle and this thread's scanline bands with the converted
// clear depth.
988 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
990 int x1, y1, x2, y2, w, h, x, y;
991 int miny1, maxy1, miny2, maxy2;
// scissor and bands must be current before they are read
995 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
996 miny1 = thread->miny1;
997 maxy1 = thread->maxy1;
998 miny2 = thread->miny2;
999 maxy2 = thread->maxy2;
1000 x1 = thread->fb_scissor[0];
1001 y1 = thread->fb_scissor[1];
1002 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1003 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to the rows this thread owns
1004 if (y1 < miny1) y1 = miny1;
1005 if (y2 > maxy2) y2 = maxy2;
1010 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to maxy1 first, then from miny2 up to maxy2
1011 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1012 for (;y < bandy;y++)
1014 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1015 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command with clear depth `d`.
// NOTE(review): the assignment of command->depth is on a line not visible in
// this view.
1019 void DPSOFTRAST_ClearDepth(float d)
1021 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1025 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1026 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1028 thread->colormask[0] = command->r != 0;
1029 thread->colormask[1] = command->g != 0;
1030 thread->colormask[2] = command->b != 0;
1031 thread->colormask[3] = command->a != 0;
1032 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command with the given channel enables.
// NOTE(review): the assignments of the command fields are on lines not
// visible in this view.
1034 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1036 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1043 DEFCOMMAND(5, DepthTest, int enable;)
1044 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1046 thread->depthtest = command->enable;
1047 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1049 void DPSOFTRAST_DepthTest(int enable)
1051 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1052 command->enable = enable;
// Command 6: scissor test enable flag.
1055 DEFCOMMAND(6, ScissorTest, int enable;)
// Applies a ScissorTest command: stores the flag and marks the framebuffer
// state (DPSOFTRAST_VALIDATE_FB) as needing revalidation.
1056 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1058 thread->scissortest = command->enable;
1059 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ScissorTest command and records `enable`.
1061 void DPSOFTRAST_ScissorTest(int enable)
1063 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1064 command->enable = enable;
// Command 7: scissor rectangle as x, y, width, height (floats).
1067 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Applies a Scissor command: copies the rectangle into thread->scissor[0..3]
// and marks the framebuffer state as needing revalidation.
1068 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1070 thread->scissor[0] = command->x;
1071 thread->scissor[1] = command->y;
1072 thread->scissor[2] = command->width;
1073 thread->scissor[3] = command->height;
1074 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Scissor command and fills in the rectangle.
// NOTE(review): only the width/height stores are visible in this excerpt;
// presumably x and y are stored on the elided lines - confirm.
1076 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1078 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1081 command->width = width;
1082 command->height = height;
// Command 8: source and destination blend factors.
1085 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Applies a BlendFunc command: stores both factors and marks the blend
// function state as needing revalidation.
1086 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1088 thread->blendfunc[0] = command->sfactor;
1089 thread->blendfunc[1] = command->dfactor;
1090 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and records both factors.
1092 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1094 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1095 command->sfactor = sfactor;
1096 command->dfactor = dfactor;
// Command 9: subtractive blending enable flag.
1099 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command: stores the flag and marks the blend
// function state as needing revalidation.
1100 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1102 thread->blendsubtract = command->enable;
1103 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and records `enable`.
1105 void DPSOFTRAST_BlendSubtract(int enable)
1107 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1108 command->enable = enable;
// Command 10: depth write enable flag.
1111 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command; no validate bit is raised in the visible code.
1112 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1114 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and records `enable`.
1116 void DPSOFTRAST_DepthMask(int enable)
1118 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1119 command->enable = enable;
// Command 11: depth comparison function selector.
1122 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command by storing the selector in the thread state.
1123 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1125 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records `func`.
1127 void DPSOFTRAST_DepthFunc(int func)
1129 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1130 command->func = func;
// Command 12: near/far depth range values.
1133 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: depthrange[0] = near, depthrange[1] = far.
1134 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1136 thread->depthrange[0] = command->nearval;
1137 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records both values.
1139 void DPSOFTRAST_DepthRange(float nearval, float farval)
1141 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1142 command->nearval = nearval;
1143 command->farval = farval;
// Command 13: polygon offset pair (along-normal term, into-view term).
1146 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: polygonoffset[0] = alongnormal,
// polygonoffset[1] = intoview.
1147 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1149 thread->polygonoffset[0] = command->alongnormal;
1150 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both terms.
1152 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1154 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1155 command->alongnormal = alongnormal;
1156 command->intoview = intoview;
// Command 14: face culling mode selector.
1159 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command by storing the mode in the thread state.
1160 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1162 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records `mode`.
1164 void DPSOFTRAST_CullFace(int mode)
1166 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1167 command->mode = mode;
// Sets the current constant colour in the global state (dpsoftrast.color).
// Unlike the command-based setters nearby, the visible code writes the global
// directly and allocates no command. DPSOFTRAST_Array_Load reads this colour
// when no colour array pointer is set.
1170 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1172 dpsoftrast.color[0] = r;
1173 dpsoftrast.color[1] = g;
1174 dpsoftrast.color[2] = b;
1175 dpsoftrast.color[3] = a;
// Reads a rectangle of 32-bit pixels from colour buffer 0 into `outpixels`.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, blockwidth*4 bytes per row
// The rectangle is clamped to the framebuffer, and rows are read bottom-up
// (source row = fb_height - 1 - y), so the copy flips the image vertically.
// Several declarations and the per-pixel copy statements are elided from this
// excerpt.
1178 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1180 int outstride = blockwidth * 4;
1181 int instride = dpsoftrast.fb_width * 4;
1184 int bx2 = blockx + blockwidth;
1185 int by2 = blocky + blockheight;
1189 unsigned char *inpixels;
// Clamp the rectangle to the framebuffer bounds.
// NOTE(review): the output offset below is computed from the clamped by1 and
// does not account for bx1 having moved, so a request with a negative origin
// looks like it would land shifted in outpixels - confirm callers never pass
// negative blockx/blocky.
1193 if (bx1 < 0) bx1 = 0;
1194 if (by1 < 0) by1 = 0;
1195 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1196 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1198 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Big-endian hosts take a per-pixel loop (body elided here; presumably a byte
// swizzle); the second loop below is the other branch.
1199 if (dpsoftrast.bigendian)
1201 for (y = by1;y < by2;y++)
1203 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1204 o = (unsigned char *)outpixels + (y - by1) * outstride;
1205 for (x = bx1;x < bx2;x++)
1218 for (y = by1;y < by2;y++)
1220 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1221 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from colour buffer 0 into mip level `mip`
// of texture `index`.
//   tx, ty - destination origin inside the texture mip level
//   sx, sy - source origin inside the framebuffer
// Returns silently when the texture handle or mip level is invalid, or when
// the clipped rectangle is empty. Source rows are read bottom-up, so the copy
// flips vertically. Some declarations and the tw/th/sw/sh computations are
// elided from this excerpt.
1227 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1231 int tx2 = tx + width;
1232 int ty2 = ty + height;
1235 int sx2 = sx + width;
1236 int sy2 = sy + height;
1246 unsigned int *spixels;
1247 unsigned int *tpixels;
1248 DPSOFTRAST_Texture *texture;
1249 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1250 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer; destination is the selected mip level. The code
// reads mipmap[mip][0] as a byte offset into texture->bytes and
// mipmap[mip][2] / [3] as that level's width / height.
1252 spixels = dpsoftrast.fb_colorpixels[0];
1253 swidth = dpsoftrast.fb_width;
1254 sheight = dpsoftrast.fb_height;
1255 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1256 twidth = texture->mipmap[mip][2];
1257 theight = texture->mipmap[mip][3];
// Clamp both rectangles to their surfaces.
1258 if (tx1 < 0) tx1 = 0;
1259 if (ty1 < 0) ty1 = 0;
1260 if (tx2 > twidth) tx2 = twidth;
1261 if (ty2 > theight) ty2 = theight;
1262 if (sx1 < 0) sx1 = 0;
1263 if (sy1 < 0) sy1 = 0;
1264 if (sx2 > swidth) sx2 = swidth;
1265 if (sy2 > sheight) sy2 = sheight;
// Copy only the area common to both clipped rectangles.
1270 if (tw > sw) tw = sw;
1271 if (th > sh) th = sh;
1272 if (tw < 1 || th < 1)
// Convert the source start row to bottom-up addressing, then copy row by row
// (tw pixels of 4 bytes each).
1274 sy1 = sheight - 1 - sy1;
1275 for (y = 0;y < th;y++)
1276 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// Regenerate the smaller mip levels when the texture has more than one.
1277 if (texture->mipmaps > 1)
1278 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17: bind `texture` (may be NULL) to texture unit `unitnum`.
1281 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread: drops this thread's bind
// reference on the texture previously bound to the unit (if any), then
// installs the new texture pointer.
1282 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1284 if (thread->texbound[command->unitnum])
1285 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1286 thread->texbound[command->unitnum] = command->texture;
1288 void DPSOFTRAST_SetTexture(int unitnum, int index)
1290 DPSOFTRAST_Command_SetTexture *command;
1291 DPSOFTRAST_Texture *texture;
1292 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1294 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1297 texture = DPSOFTRAST_Texture_GetByIndex(index);
1298 if (index && !texture)
1300 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1304 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1305 command->unitnum = unitnum;
1306 command->texture = texture;
1308 dpsoftrast.texbound[unitnum] = texture;
1309 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in global state.
// DPSOFTRAST_Array_Load later applies the stride as a byte offset per vertex.
// The data is not copied here; only the pointer is stored.
1312 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1314 dpsoftrast.pointer_vertex3f = vertex3f;
1315 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA colour array as the colour source and clears the
// byte-colour pointer, so only one of the two colour pointers is set.
1317 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1319 dpsoftrast.pointer_color4f = color4f;
1320 dpsoftrast.pointer_color4ub = NULL;
1321 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA colour array as the colour source and clears
// the float-colour pointer, so only one of the two colour pointers is set.
1323 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1325 dpsoftrast.pointer_color4f = NULL;
1326 dpsoftrast.pointer_color4ub = color4ub;
1327 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for unit `unitnum`: pointer, number of
// float components per vertex, and stride. DPSOFTRAST_Array_Load dispatches on
// the component count when loading.
// NOTE(review): `unitnum` indexes the three arrays without a range check,
// unlike DPSOFTRAST_SetTexture - confirm callers guarantee a valid unit.
1329 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1331 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1332 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1333 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18: shader selection (mode, permutation bits, exact-specular flag).
1336 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command by copying the three fields into thread state.
1337 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1339 thread->shader_mode = command->mode;
1340 thread->shader_permutation = command->permutation;
1341 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: allocates a SetShader command and also mirrors the same
// values into the global state.
1343 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1345 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1346 command->mode = mode;
1347 command->permutation = permutation;
1348 command->exactspecularmath = exactspecularmath;
1350 dpsoftrast.shader_mode = mode;
1351 dpsoftrast.shader_permutation = permutation;
1352 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19: set one 4-float uniform slot.
1355 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: uniform `index` occupies floats
// [index*4, index*4+3] of thread->uniform4f.
1356 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: allocates a Uniform4f command carrying the four values
// and mirrors them into the global uniform table.
1360 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1362 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1363 command->index = index;
1364 command->val[0] = v0;
1365 command->val[1] = v1;
1366 command->val[2] = v2;
1367 command->val[3] = v3;
1369 dpsoftrast.uniform4f[index*4+0] = v0;
1370 dpsoftrast.uniform4f[index*4+1] = v1;
1371 dpsoftrast.uniform4f[index*4+2] = v2;
1372 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer form of DPSOFTRAST_Uniform4f: allocates the same Uniform4f command,
// copying four floats from `v`, and mirrors them into the global uniform table.
1374 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1376 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1377 command->index = index;
1378 memcpy(command->val, v, sizeof(command->val));
1380 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20: set a 4x4 matrix uniform. It spans four consecutive 4-float
// uniform slots starting at `index`; the payload is 16-byte aligned.
1383 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into
// thread->uniform4f starting at index*4.
1384 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1386 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: allocates one UniformMatrix4f command per matrix in an
// array of `arraysize` matrices read from `v`.
//   uniform   - first uniform slot; each matrix advances the slot index by 4
//   transpose - NOTE(review): the condition guarding the transpose sequence
//               below is elided from this excerpt; presumably it tests this
//               flag - confirm
// The result is also mirrored into the global uniform table.
1388 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1392 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1394 __m128 m0, m1, m2, m3;
1395 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1396 command->index = (DPSOFTRAST_UNIFORM)index;
// Use unaligned loads when the source is not ALIGN_SIZE-aligned.
1397 if (((size_t)v)&(ALIGN_SIZE-1))
1399 m0 = _mm_loadu_ps(v);
1400 m1 = _mm_loadu_ps(v+4);
1401 m2 = _mm_loadu_ps(v+8);
1402 m3 = _mm_loadu_ps(v+12);
1406 m0 = _mm_load_ps(v);
1407 m1 = _mm_load_ps(v+4);
1408 m2 = _mm_load_ps(v+8);
1409 m3 = _mm_load_ps(v+12);
// 4x4 transpose via unpack/move: afterwards m0..m3 hold what were the columns.
1413 __m128 t0, t1, t2, t3;
1414 t0 = _mm_unpacklo_ps(m0, m1);
1415 t1 = _mm_unpacklo_ps(m2, m3);
1416 t2 = _mm_unpackhi_ps(m0, m1);
1417 t3 = _mm_unpackhi_ps(m2, m3);
1418 m0 = _mm_movelh_ps(t0, t1);
1419 m1 = _mm_movehl_ps(t1, t0);
1420 m2 = _mm_movelh_ps(t2, t3);
1421 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores: command->val is declared with ALIGN; the global table is
// stored with aligned stores too, so index*4 is expected to stay 16-byte
// aligned there.
1423 _mm_store_ps(command->val, m0);
1424 _mm_store_ps(command->val+4, m1);
1425 _mm_store_ps(command->val+8, m2);
1426 _mm_store_ps(command->val+12, m3);
1427 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1428 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1429 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1430 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21: set one integer uniform.
1435 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command to the thread's integer uniform table.
1436 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1438 thread->uniform1i[command->index] = command->val;
// Public entry point: allocates a Uniform1i command and mirrors the value
// into the global integer uniform table.
// NOTE(review): the store of i0 into command->val is not visible in this
// excerpt; presumably it is on the elided line - confirm.
1440 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1442 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1443 command->index = index;
1446 dpsoftrast.uniform1i[command->index] = i0;
// Command 24: user clip plane as four floats.
1449 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the plane into thread state and marks
// the framebuffer state as needing revalidation.
1450 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1452 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1453 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ClipPlane command holding (x, y, z, w).
1455 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1457 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1458 command->clipplane[0] = x;
1459 command->clipplane[1] = y;
1460 command->clipplane[2] = z;
1461 command->clipplane[3] = w;
// Copies `size` 4-float vectors from a strided byte source into a packed,
// 16-byte-aligned destination (dst is written with aligned stores).
//   src    - first source element
//   stride - byte distance between source elements
// Aligned loads are used only when both the source address and the stride are
// multiples of ALIGN_SIZE; otherwise unaligned loads. The loop headers and
// pointer increments are elided from this excerpt.
1465 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1467 float *end = dst + size*4;
1468 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1472 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1481 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a strided byte source into packed
// 4-float vectors (x, y, z, 1.0f) in a 16-byte-aligned destination.
// The lane trick used throughout: rotate so the unwanted lane sits in lane 0,
// overwrite lane 0 with 1.0f using _mm_move_ss, then rotate back so the 1.0f
// ends up in lane 3.
// Loop headers and some pointer increments are elided from this excerpt.
1488 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1490 float *end = dst + size*4;
// Tightly packed source: handle four vertices (12 floats = three 16-byte
// loads) per iteration up to end4.
1491 if (stride == sizeof(float[3]))
1493 float *end4 = dst + (size&~3)*4;
1494 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned variant. v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3.
1498 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1499 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1500 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1501 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1502 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1503 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1504 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1505 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1506 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1507 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1510 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512 src += 4*sizeof(float[3]);
// Aligned variant: identical shuffles, aligned loads.
1519 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1520 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1521 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1522 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1523 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1524 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1525 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1526 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1527 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1531 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533 src += 4*sizeof(float[3]);
// One-vertex-at-a-time path. Each iteration loads 16 bytes, i.e. one float
// past the 3-float vertex; that extra lane is replaced by 1.0f.
// NOTE(review): the 16-byte load reads 4 bytes beyond the final vertex -
// confirm the source buffers tolerate that.
1537 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1541 __m128 v = _mm_loadu_ps((const float *)src);
1542 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1543 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1544 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1545 _mm_store_ps(dst, v);
1554 __m128 v = _mm_load_ps((const float *)src);
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1556 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1557 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1558 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a strided byte source into packed
// 4-float vectors (s, t, 0.0f, 1.0f) in a 16-byte-aligned destination.
// Loop headers and some pointer increments are elided from this excerpt.
1565 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1567 float *end = dst + size*4;
// Template supplying the (0, 1) upper lanes of every output vector.
1568 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed source: one 16-byte load holds two vertices, up to end2.
1569 if (stride == sizeof(float[2]))
1571 float *end2 = dst + (size&~1)*4;
1572 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = s0 t0 s1 t1; the shuffle yields (s0, t0, 0, 1) and movehl (s1, t1, 0, 1).
1576 __m128 v = _mm_loadu_ps((const float *)src);
1577 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1578 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1580 src += 2*sizeof(float[2]);
1587 __m128 v = _mm_load_ps((const float *)src);
1588 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1591 src += 2*sizeof(float[2]);
// Remaining / strided vertices: load 8 bytes into the low lanes and keep
// v2's (0, 1) in the high lanes.
1597 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a strided byte source into packed
// 4-float vectors, scaling each byte by 1/255 so 0..255 maps to 0.0..1.0.
// Loop headers and some pointer increments are elided from this excerpt.
1603 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1605 float *end = dst + size*4;
1606 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed source: 16 bytes = four vertices per iteration, up to end4.
1607 if (stride == sizeof(unsigned char[4]))
1609 float *end4 = dst + (size&~3)*4;
1610 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Zero-extend bytes -> 16-bit -> 32-bit, convert to float, then scale.
1614 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1615 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1616 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1617 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1618 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1620 src += 4*sizeof(unsigned char[4]);
1627 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1628 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1630 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1631 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1633 src += 4*sizeof(unsigned char[4]);
// Remaining / strided vertices: one 32-bit read per vertex.
// NOTE(review): *(const int *)src assumes the source is readable as an int at
// that address - confirm for arbitrary strides.
1639 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1640 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` packed 4-float vectors at `dst` with the single 4-float value
// read from `src`. The source is read with an unaligned load; `dst` is written
// with aligned stores and so must be 16-byte aligned. The loop header and
// pointer increment are elided from this excerpt.
1646 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1648 float *end = dst + 4*size;
1649 __m128 v = _mm_loadu_ps(src);
1652 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix.
//   out4f       - destination; may be the same buffer as in4f (the identity
//                 path explicitly handles out4f == in4f)
//   in4f        - source vectors
//   inmatrix16f - 16 floats, read as four consecutive 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// Loop headers and the store call opening each expression are elided from
// this excerpt.
1658 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1661 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1662 __m128 m0, m1, m2, m3;
// Bitwise comparison against the identity: an exact match lets the transform
// be replaced by a copy (or skipped entirely when done in place).
1664 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1666 // fast case for identity matrix
1667 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1670 end = out4f + numitems*4;
1671 m0 = _mm_loadu_ps(inmatrix16f);
1672 m1 = _mm_loadu_ps(inmatrix16f + 4);
1673 m2 = _mm_loadu_ps(inmatrix16f + 8);
1674 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same loop: unaligned loads first, aligned loads second.
1675 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1679 __m128 v = _mm_loadu_ps(in4f);
1681 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1682 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1683 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1684 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1693 __m128 v = _mm_load_ps(in4f);
1695 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1696 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1697 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1698 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f. Uses memcpy, so the
// two buffers must not overlap.
1706 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1708 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// SSE helper macros for projection and transformation.
//
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
//   takes a 4-float vector `in` = (x, y, z, w). It rotates the lanes to
//   (1, x, y, z), computes viewportcenter + viewportscale * that / w lane by
//   lane, then rotates back. `out` lanes 0..2 are therefore the projected
//   x, y, z (using center/scale lanes 1..3) and lane 3 is
//   viewportcenter[0] + viewportscale[0] / w.
//
// DPSOFTRAST_PROJECTY: the visible body lines are identical to
//   DPSOFTRAST_PROJECTVERTEX.
//
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3. The declaration of `p` is elided
//   from this excerpt; presumably it is a copy of `in` - confirm.
//
// All three expand to multi-statement bodies that declare locals, so they are
// only usable as statements.
1712 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1714 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1715 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1716 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1717 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1720 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1722 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1728 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1731 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1732 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1733 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1734 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a conservative range of screen rows covered by an axis-aligned
// bounding box after transformation by `inmatrix16f`.
//   starty, endy     - outputs: first row and one-past-last row
//   minposf, maxposf - box corners, 16-byte aligned (read with aligned loads)
//   inmatrix16f      - 16 floats, read as four 4-float groups
// How it works, as far as the visible code shows:
//   * The matrix groups are shuffled so that the transformed Y lands in
//     lane 0, which lets the min/max tracking use scalar (_ss) operations.
//   * BBFRONT(k, pos) transforms corner k and computes clipdist = z + w in
//     lane 0. When clipdist >= 0 it clears bit k of clipmask and folds y / w
//     into minproj / maxproj.
//   * BBCLIP(k) handles corners whose bit is still set: for each of the three
//     neighbouring corners (k^1, k^2, k^4) whose bit is clear, it interpolates
//     along the edge to where clipdist reaches 0 and folds that point's y / w
//     into the range.
//   * The range is clamped to [-2, 2], mapped through lane 2 of the viewport
//     center/scale, and truncated to ints.
// `starty` is taken from maxproj and `endy` from minproj; presumably the
// viewport Y scale is negative so that the order flips - confirm.
// The return statement is elided from this excerpt; presumably the function
// returns clipmask (non-zero when some corner failed the z + w >= 0 test).
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1739 int clipmask = 0xFF;
1740 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1741 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1744 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1748 #define BBFRONT(k, pos) \
1750 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1755 clipmask &= ~(1<<k); \
1756 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757 minproj = _mm_min_ss(minproj, proj); \
1758 maxproj = _mm_max_ss(maxproj, proj); \
1762 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1763 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1764 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1765 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1766 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1767 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1771 if (clipmask&(1<<k)) \
1773 if (!(clipmask&(1<<(k^1)))) \
1775 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778 minproj = _mm_min_ss(minproj, proj); \
1779 maxproj = _mm_max_ss(maxproj, proj); \
1781 if (!(clipmask&(1<<(k^2)))) \
1783 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786 minproj = _mm_min_ss(minproj, proj); \
1787 maxproj = _mm_max_ss(maxproj, proj); \
1789 if (!(clipmask&(1<<(k^4)))) \
1791 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794 minproj = _mm_min_ss(minproj, proj); \
1795 maxproj = _mm_max_ss(maxproj, proj); \
1799 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Bring lane 2 of the viewport vectors into lane 0 for the scalar math below.
1800 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// Clamp the projected range to [-2, 2] before mapping to rows.
1802 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// Truncating conversion; +1 makes endy exclusive.
1806 *starty = _mm_cvttss_si32(maxproj);
1807 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` vertices to screen space without a matrix transform.
//   out4f     - receives an unmodified copy of each input vertex (aligned)
//   screen4f  - receives the DPSOFTRAST_PROJECTVERTEX result per vertex
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f      - source vertices; may be unaligned
// While looping it tracks the per-component min and max of the inputs, then
// passes that bounding box with an identity matrix to DPSOFTRAST_Vertex_BoundY
// and returns its result. Loop headers and pointer increments are elided from
// this excerpt.
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1813 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814 float *end = out4f + numitems*4;
1815 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816 __m128 minpos, maxpos;
// Two copies of the same loop: unaligned loads first, aligned loads second.
// NOTE(review): min/max are seeded from the first vertex, so in4f is read
// even when numitems is 0 - confirm callers never pass an empty batch.
1817 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1819 minpos = maxpos = _mm_loadu_ps(in4f);
1822 __m128 v = _mm_loadu_ps(in4f);
1823 minpos = _mm_min_ps(minpos, v);
1824 maxpos = _mm_max_ps(maxpos, v);
1825 _mm_store_ps(out4f, v);
1826 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827 _mm_store_ps(screen4f, v);
1835 minpos = maxpos = _mm_load_ps(in4f);
1838 __m128 v = _mm_load_ps(in4f);
1839 minpos = _mm_min_ps(minpos, v);
1840 maxpos = _mm_max_ps(maxpos, v);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned scalars for the row-range computation.
1851 ALIGN(float minposf[4]);
1852 ALIGN(float maxposf[4]);
1853 _mm_store_ps(minposf, minpos);
1854 _mm_store_ps(maxposf, maxpos);
1855 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` vertices by `inmatrix16f` and projects them to screen
// space.
//   out4f     - receives the transformed vertices (aligned)
//   screen4f  - receives the projected vertices
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f      - source vertices; may be unaligned
// An exact identity matrix is delegated to DPSOFTRAST_Vertex_Project.
// Otherwise the bounding box is gathered from the UNtransformed inputs and
// handed to DPSOFTRAST_Vertex_BoundY together with the matrix, whose result is
// returned. Loop headers and pointer increments are elided from this excerpt.
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1862 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1865 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867 end = out4f + numitems*4;
1868 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870 m0 = _mm_loadu_ps(inmatrix16f);
1871 m1 = _mm_loadu_ps(inmatrix16f + 4);
1872 m2 = _mm_loadu_ps(inmatrix16f + 8);
1873 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the same loop: unaligned loads first, aligned loads second.
1874 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1876 minpos = maxpos = _mm_loadu_ps(in4f);
1879 __m128 v = _mm_loadu_ps(in4f);
1880 minpos = _mm_min_ps(minpos, v);
1881 maxpos = _mm_max_ps(maxpos, v);
1882 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883 _mm_store_ps(out4f, v);
1884 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885 _mm_store_ps(screen4f, v);
1893 minpos = maxpos = _mm_load_ps(in4f);
1896 __m128 v = _mm_load_ps(in4f);
1897 minpos = _mm_min_ps(minpos, v);
1898 maxpos = _mm_max_ps(maxpos, v);
1899 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900 _mm_store_ps(out4f, v);
1901 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned scalars for the row-range computation.
1910 ALIGN(float minposf[4]);
1911 ALIGN(float maxposf[4]);
1912 _mm_store_ps(minposf, minpos);
1913 _mm_store_ps(maxposf, maxpos);
1914 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex attribute array into the packed 4-float work buffer
// dpsoftrast.post_array4f[outarray], covering vertices
// [firstvertex, firstvertex + numvertices).
//   outarray - index of the destination work buffer
//   inarray  - which attribute to load (DPSOFTRAST_ARRAY_POSITION, _COLOR, or
//              a texcoord array at DPSOFTRAST_ARRAY_TEXCOORD0 + n)
// The switch header, case labels for the component counts, and the return
// statement are elided from this excerpt; presumably it returns `outf`.
1920 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1923 float *outf = dpsoftrast.post_array4f[outarray];
1924 const unsigned char *inb;
1925 int firstvertex = dpsoftrast.firstvertex;
1926 int numvertices = dpsoftrast.numvertices;
// Positions: 3 floats per vertex, expanded to (x, y, z, 1).
1930 case DPSOFTRAST_ARRAY_POSITION:
1931 stride = dpsoftrast.stride_vertex;
1932 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1933 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colours: float array if set, else byte array if set, else the constant
// colour from DPSOFTRAST_Color4f replicated for every vertex.
1935 case DPSOFTRAST_ARRAY_COLOR:
1936 stride = dpsoftrast.stride_color;
1937 if (dpsoftrast.pointer_color4f)
1939 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1940 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1942 else if (dpsoftrast.pointer_color4ub)
1944 stride = dpsoftrast.stride_color;
1945 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1946 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1950 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texture coordinates: unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0; the
// loader is chosen from the unit's component count (2, 3 or 4 floats).
1954 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1955 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1957 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1958 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1961 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1964 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1967 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms work buffer `outarray` in place by `inmatrix16f`. When
// inarray >= 0 the buffer is first filled from that client array; a negative
// inarray reuses whatever the buffer already holds. The return statement is
// elided from this excerpt; presumably it returns `data`.
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1981 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects work buffer `outarray` to screen space. When inarray >= 0 the
// buffer is first filled from that client array. Results go to
// dpsoftrast.screencoord4f; the covered row range goes to drawstarty /
// drawendy and the clip mask to drawclipped. The return statement is elided
// from this excerpt; presumably it returns `data`.
1987 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1990 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1991 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms work buffer `outarray` in place by `inmatrix16f` and projects it
// to screen space. When inarray >= 0 the buffer is first filled from that
// client array. Results go to dpsoftrast.screencoord4f; the covered row range
// goes to drawstarty / drawendy and the clip mask to drawclipped. The return
// statement is elided from this excerpt; presumably it returns `data`.
1999 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2002 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2003 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel perspective-correction values for one span.
//   triangle->w - plane coefficients: w[0] is the slope per x, w[1] the slope
//                 per y, w[2] the constant term
//   span        - supplies the x range [startx, endx) and the base position
//   zf          - output array; the stores into it are elided from this
//                 excerpt, presumably zf[x] receives 1/w per pixel - confirm
// 1/w is evaluated exactly only at subspan boundaries, every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and interpolated linearly in between, so
// there is one division per subspan instead of one per pixel.
2010 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2013 int startx = span->startx;
2014 int endx = span->endx;
2015 float wslope = triangle->w[0];
// w at x == 0 of this span; startx is applied separately below.
2016 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017 float endz = 1.0f / (w + wslope * startx);
// No variation along x: every pixel in the span shares the same value.
2018 if (triangle->w[0] == 0)
2020 // LordHavoc: fast flat polygons (HUD/menu)
2021 for (x = startx;x < endx;x++)
2025 for (x = startx;x < endx;)
2027 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// The last subspan is shortened to end on the final pixel, endx-1.
2029 if (nextsub >= endx) nextsub = endsub = endx-1;
2030 endz = 1.0f / (w + wslope * nextsub);
// Guard the single-pixel case (x == nextsub) against dividing by zero.
2031 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032 for (; x <= endsub; x++, z += dz)
// Writes one span of finished BGRA8 pixels (in4ub) into color buffer 0, using
// span->pixelmask to decide which pixels are touched and thread->fb_blendmode
// to decide how source and destination are combined.
// Visible steps: (1) optional alpha-kill, (2) masking of zero-alpha pixels for
// the alpha-based blend modes, (3) sentinel bytes after the mask, (4) a search
// for runs of set mask bytes, (5) copy or blend of each run.
2037 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2041 int startx = span->startx;
2042 int endx = span->endx;
2045 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046 unsigned char * RESTRICT pixelmask = span->pixelmask;
2047 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2048 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views of the framebuffer (byte and 32-bit) to this span's origin
2051 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2052 pixeli += span->y * dpsoftrast.fb_width + span->x;
2053 // handle alphatest now (this affects depth writes too)
2054 if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2055 for (x = startx;x < endx;x++)
2056 if (in4ub[x*4+3] < 128)
2057 pixelmask[x] = false;
2058 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2059 // helps sprites, text and hud artwork
2060 switch(thread->fb_blendmode)
2062 case DPSOFTRAST_BLENDMODE_ALPHA:
2063 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alternate between runs with non-zero alpha (left alone) and runs with zero
// alpha (mask cleared)
2066 for (x = startx;x < endx;x++)
2068 if (in4ub[x*4+3] >= 1)
2073 while (++x < endx && in4ub[x*4+3] >= 1) ;
2075 if (x >= endx) break;
2077 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2078 if (x >= endx) break;
2085 case DPSOFTRAST_BLENDMODE_OPAQUE:
2086 case DPSOFTRAST_BLENDMODE_ADD:
2087 case DPSOFTRAST_BLENDMODE_INVMOD:
2088 case DPSOFTRAST_BLENDMODE_MUL:
2089 case DPSOFTRAST_BLENDMODE_MUL2:
2090 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2091 case DPSOFTRAST_BLENDMODE_INVADD:
// NOTE(review): the two sentinel writes below land at pixelmask[endx] and
// pixelmask[endx+1], so the mask buffer must hold at least endx+2 bytes -
// confirm where the mask is allocated.
2094 // put some special values at the end of the mask to ensure the loops end
2095 pixelmask[endx] = 1;
2096 pixelmask[endx+1] = 0;
2097 // LordHavoc: use a double loop to identify subspans, this helps the
2098 // optimized copy/blend loops to perform at their best, most triangles
2099 // have only one run of pixels, and do the search using wide reads...
2103 // if this pixel is masked off, it's probably not alone...
2110 // the 4-item search must be aligned or else it stalls badly
2111 if ((x & 3) && !pixelmask[x])
2113 if(pixelmask[x]) goto endmasked;
2117 if(pixelmask[x]) goto endmasked;
2121 if(pixelmask[x]) goto endmasked;
// four mask bytes are tested per iteration by reading the byte mask through an
// unsigned int pointer
2126 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2130 for (;!pixelmask[x];x++)
2132 // rather than continue the loop, just check the end variable
2137 // find length of subspan
2140 if (subx + 8 < endx)
2144 if(!pixelmask[subx]) goto endunmasked;
2148 if(!pixelmask[subx]) goto endunmasked;
2152 if(!pixelmask[subx]) goto endunmasked;
// same wide-read trick for runs of set bytes (0x01 in each of the four bytes)
2157 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2161 for (;pixelmask[subx];subx++)
2163 // the checks can overshoot, so make sure to clip it...
2167 // now that we know the subspan length... process!
2168 switch(thread->fb_blendmode)
// opaque: plain copy of pixels [x, subx) from the input to the framebuffer;
// a memcpy form and an unrolled SSE form (16 pixels, then 4 pixels per step)
// are both visible here
2170 case DPSOFTRAST_BLENDMODE_OPAQUE:
2174 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2179 while (x + 16 <= subx)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2183 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2184 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2189 while (x + 4 <= subx)
2191 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2197 pixeli[x+1] = ini[x+1];
// FINISHBLEND (defined right after this case label) widens the 8-bit channels
// of src and dst to 16-bit lanes, handles pixels in pairs and then one
// trailing pixel, and packs the result back with unsigned saturation.
// Judging by the invocations, the first macro argument is the two-pixel blend
// and the second the one-pixel blend.
// ALPHA: dst += ((src - dst) * alpha) >> 8, alpha being channel 3 of src
2207 case DPSOFTRAST_BLENDMODE_ALPHA:
2208 #define FINISHBLEND(blend2, blend1) \
2209 for (;x + 1 < subx;x += 2) \
2212 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2215 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst
2243 case DPSOFTRAST_BLENDMODE_ADD:
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2253 case DPSOFTRAST_BLENDMODE_MUL:
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2256 case DPSOFTRAST_BLENDMODE_MUL2:
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) >> 8
2277 case DPSOFTRAST_BLENDMODE_INVADD:
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Fetches a single texel (4 bytes) from mip level `mip` of `texture` at the
// normalized coordinates (x, y) and returns it in c[4].
// With DPSOFTRAST_TEXTURE_FILTER_LINEAR set, the four neighbouring texels are
// blended bilinearly in 12-bit fixed point; otherwise one texel is selected.
// DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE clamps coordinates to [0, size-1];
// without it they are masked with size-1.
// NOTE(review): masking only repeats correctly for power-of-two sizes -
// confirm that texture dimensions are restricted accordingly.
2289 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2290 // warning: this is SLOW, only use if the optimized per-span functions won't do
2292 const unsigned char * RESTRICT pixelbase;
2293 const unsigned char * RESTRICT pixel[4];
2294 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2295 int wrapmask[2] = { width-1, height-1 };
2296 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2297 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// 20.12 fixed-point texel coordinates; 2048 is half a texel (0.5 * 4096)
// NOTE(review): tc is unsigned but the float expression is negative for small
// x/y (x = 0 gives -2048); converting a negative float to an unsigned type is
// undefined behaviour in C - confirm this is intended to wrap.
2299 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2300 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2301 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// corner weights are products of two 12-bit factors, so they add up to 1<<24
2302 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2303 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2304 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2305 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2307 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2308 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2309 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2310 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
// repeat addressing (presumably the else branch; the 'else' line itself is not
// visible in this excerpt)
2314 tci[0] &= wrapmask[0];
2315 tci[1] &= wrapmask[1];
2316 tci1[0] &= wrapmask[0];
2317 tci1[1] &= wrapmask[1];
// the four corners: (x0,y0), (x1,y0), (x0,y1), (x1,y1)
2319 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2320 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2321 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2322 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum per channel; >>24 removes the fixed-point scale of the weights
2323 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2324 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2325 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2326 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// unfiltered lookup: truncate the scaled coordinate to a texel index
2330 int tci[2] = { x * width, y * height };
2331 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2333 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2334 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2338 tci[0] &= wrapmask[0];
2339 tci[1] &= wrapmask[1];
// NOTE(review): the stores of this texel into c[] are not visible in this excerpt
2341 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Samples the 2D texture bound to texunitindex across a span and writes float
// RGBA into out4f (texel bytes 2,1,0,3 go to components 0,1,2,3 and are scaled
// to the 0..1 range).
// Texture coordinates come from vertex attribute arrayindex, are multiplied by
// zf at sub-span endpoints and are stepped linearly in 20.12 fixed point in
// between. Four inner loops cover {bilinear, nearest} x {clamp-to-edge, repeat}.
// The single-texel shortcut reads through pixelbase so that it uses the texel
// of the selected mip level, as the BGRA8 variant of this function does.
2349 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2352 int startx = span->startx;
2353 int endx = span->endx;
2358 float tc[2], endtc[2];
2360 unsigned int tci[2];
2361 unsigned int tci1[2];
2362 unsigned int tcimin[2];
2363 unsigned int tcimax[2];
2368 const unsigned char * RESTRICT pixelbase;
2369 const unsigned char * RESTRICT pixel[4];
2370 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2371 // if no texture is bound, just fill it with white
2374 for (x = startx;x < endx;x++)
2376 out4f[x*4+0] = 1.0f;
2377 out4f[x*4+1] = 1.0f;
2378 out4f[x*4+2] = 1.0f;
2379 out4f[x*4+3] = 1.0f;
2383 mip = triangle->mip[texunitindex];
2384 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2385 // if this mipmap of the texture is 1 pixel, just fill it with that color
2386 if (texture->mipmap[mip][1] == 4)
// the lone texel of this mip level lives at pixelbase, not at the start of
// texture->bytes
2388 c[0] = pixelbase[2] * (1.0f/255.0f);
2389 c[1] = pixelbase[1] * (1.0f/255.0f);
2390 c[2] = pixelbase[0] * (1.0f/255.0f);
2391 c[3] = pixelbase[3] * (1.0f/255.0f);
2392 for (x = startx;x < endx;x++)
2394 out4f[x*4+0] = c[0];
2395 out4f[x*4+1] = c[1];
2396 out4f[x*4+2] = c[2];
2397 out4f[x*4+3] = c[3];
2401 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2402 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2403 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2404 tcscale[0] = texture->mipmap[mip][2];
2405 tcscale[1] = texture->mipmap[mip][3];
2406 tciwidth = texture->mipmap[mip][2];
// NOTE(review): tcimin is compared against in the clamp loops below, but its
// initialisation is not visible in this excerpt.
2409 tcimax[0] = texture->mipmap[mip][2]-1;
2410 tcimax[1] = texture->mipmap[mip][3]-1;
2411 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2412 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2413 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2414 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2420 for (x = startx;x < endx;)
2422 unsigned int subtc[2];
2423 unsigned int substep[2];
// 4096 = 1<<12 converts the per-sub-span delta into a 20.12 per-pixel step
2424 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2425 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2426 if (nextsub >= endx)
2428 nextsub = endsub = endx-1;
2429 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2433 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2434 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2440 substep[0] = (endtc[0] - tc[0]) * subscale;
2441 substep[1] = (endtc[1] - tc[1]) * subscale;
2442 subtc[0] = tc[0] * (1<<12);
2443 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp-to-edge
2446 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2448 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2450 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2451 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2452 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2453 tci[0] = subtc[0]>>12;
2454 tci[1] = subtc[1]>>12;
2455 tci1[0] = tci[0] + 1;
2456 tci1[1] = tci[1] + 1;
2457 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2458 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2459 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2460 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2461 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2463 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2464 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * (1<<24): undoes the weight scale and maps 0..255 to 0..1
2465 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2466 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2467 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2468 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2469 out4f[x*4+0] = c[0];
2470 out4f[x*4+1] = c[1];
2471 out4f[x*4+2] = c[2];
2472 out4f[x*4+3] = c[3];
// bilinear, repeat (coordinates masked with size-1)
2477 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2479 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2480 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2481 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2482 tci[0] = subtc[0]>>12;
2483 tci[1] = subtc[1]>>12;
2484 tci1[0] = tci[0] + 1;
2485 tci1[1] = tci[1] + 1;
2486 tci[0] &= tciwrapmask[0];
2487 tci[1] &= tciwrapmask[1];
2488 tci1[0] &= tciwrapmask[0];
2489 tci1[1] &= tciwrapmask[1];
2490 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2491 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2492 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2493 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2494 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2495 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2496 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2497 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2498 out4f[x*4+0] = c[0];
2499 out4f[x*4+1] = c[1];
2500 out4f[x*4+2] = c[2];
2501 out4f[x*4+3] = c[3];
// nearest, clamp-to-edge
2505 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2507 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2509 tci[0] = subtc[0]>>12;
2510 tci[1] = subtc[1]>>12;
2511 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2512 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2513 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2514 c[0] = pixel[0][2] * (1.0f / 255.0f);
2515 c[1] = pixel[0][1] * (1.0f / 255.0f);
2516 c[2] = pixel[0][0] * (1.0f / 255.0f);
2517 c[3] = pixel[0][3] * (1.0f / 255.0f);
2518 out4f[x*4+0] = c[0];
2519 out4f[x*4+1] = c[1];
2520 out4f[x*4+2] = c[2];
2521 out4f[x*4+3] = c[3];
// nearest, repeat
2526 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2528 tci[0] = subtc[0]>>12;
2529 tci[1] = subtc[1]>>12;
2530 tci[0] &= tciwrapmask[0];
2531 tci[1] &= tciwrapmask[1];
2532 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2533 c[0] = pixel[0][2] * (1.0f / 255.0f);
2534 c[1] = pixel[0][1] * (1.0f / 255.0f);
2535 c[2] = pixel[0][0] * (1.0f / 255.0f);
2536 c[3] = pixel[0][3] * (1.0f / 255.0f);
2537 out4f[x*4+0] = c[0];
2538 out4f[x*4+1] = c[1];
2539 out4f[x*4+2] = c[2];
2540 out4f[x*4+3] = c[3];
// SSE2 span sampler: looks up the 2D texture bound to texunitindex for every
// pixel of the span and stores packed 32-bit texels into out4ub.
// Texture coordinates come from vertex attribute arrayindex, are multiplied by
// zf at sub-span endpoints and are stepped in 16.16 fixed point in between,
// two pixels per iteration with a single-pixel tail.
// The visible paths are: bilinear with all coordinates known to be in range
// (direct 8-byte loads of adjacent texels), bilinear clamp-to-edge, bilinear
// repeat, nearest clamp-to-edge and nearest repeat.
// In the clamp-to-edge bilinear pair loop both pixels clamp their corner
// coordinates; the second pixel must not use the repeat mask.
2546 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2550 int startx = span->startx;
2551 int endx = span->endx;
2553 __m128 data, slope, tcscale;
2554 __m128i tcsize, tcmask, tcoffset, tcmax;
2556 __m128i subtc, substep, endsubtc;
2559 int affine; // LordHavoc: optimized affine texturing case
2560 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2561 const unsigned char * RESTRICT pixelbase;
2562 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2563 // if no texture is bound, just fill it with white
2566 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2569 mip = triangle->mip[texunitindex];
2570 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2571 // if this mipmap of the texture is 1 pixel, just fill it with that color
2572 if (texture->mipmap[mip][1] == 4)
2574 unsigned int k = *((const unsigned int *)pixelbase);
2575 for (x = startx;x < endx;x++)
// equal zf at both ends of the span: the whole span is handled as one sub-span
2579 affine = zf[startx] == zf[endx-1];
2580 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2581 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2582 flags = texture->flags;
// tcsize = (width, height, width, height) taken from mipmap[mip][2] and [3]
2583 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2584 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2585 tcscale = _mm_cvtepi32_ps(tcsize);
2586 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2587 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2588 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift (presumably applied only when filtering; the guarding line
// is not visible in this excerpt)
2590 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2591 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// every 32-bit lane holds 4 in its low half and width*4 in its high half, so
// _mm_madd_epi16 on an (x, y) pair yields the byte offset x*4 + y*width*4
2592 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2593 tcmax = _mm_packs_epi32(tcmask, tcmask);
2594 for (x = startx;x < endx;)
2596 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2597 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2598 if (nextsub >= endx || affine)
2600 nextsub = endsub = endx-1;
2601 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2605 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2607 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2608 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2609 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// pack the coordinates of pixel x (low half) and pixel x+1 (high half) into
// one register, then double the step because two pixels advance per iteration
2610 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2611 substep = _mm_slli_epi32(substep, 1);
// in-range test: the integer coordinates at both ends of the sub-span must lie
// in [0, size-1) so that the +1 neighbours are still inside the texture
2614 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2615 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row pitch in bytes (high half of tcoffset)
2617 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2618 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2620 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2621 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2622 tci = _mm_madd_epi16(tci, tcoffset);
2623 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2624 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 8-byte load fetches two horizontally adjacent texels; ptr + stride is
// the row below
2625 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2626 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2627 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2628 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved to 15 bits so they stay positive for the signed mulhi
2629 fracm = _mm_srli_epi16(subtc, 1);
2630 pix1 = _mm_add_epi16(pix1,
2631 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633 pix3 = _mm_add_epi16(pix3,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638 pix2 = _mm_add_epi16(pix2,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel of the in-range bilinear path
2645 const unsigned char * RESTRICT ptr1;
2646 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2647 tci = _mm_madd_epi16(tci, tcoffset);
2648 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2650 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2651 fracm = _mm_srli_epi16(subtc, 1);
2652 pix1 = _mm_add_epi16(pix1,
2653 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2654 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2655 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2656 pix1 = _mm_add_epi16(pix1,
2657 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2659 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, clamp-to-edge: the four corner coordinates of each pixel are
// clamped to [0, size-1] before the texel loads
2663 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2665 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2667 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// the added constant encodes the corner offsets (0,0), (1,0), (0,1), (1,1)
2668 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2669 tci = _mm_madd_epi16(tci, tcoffset);
2670 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2671 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2672 _mm_setzero_si128());
2673 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2674 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2675 _mm_setzero_si128());
2676 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// second pixel of the pair: clamp exactly like the first pixel; masking here
// would wrap to the opposite edge of a clamp-to-edge texture
2677 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2678 tci = _mm_madd_epi16(tci, tcoffset);
2679 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681 _mm_setzero_si128());
2682 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684 _mm_setzero_si128());
2685 fracm = _mm_srli_epi16(subtc, 1);
2686 pix1 = _mm_add_epi16(pix1,
2687 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689 pix3 = _mm_add_epi16(pix3,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2691 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2692 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2693 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2694 pix2 = _mm_add_epi16(pix2,
2695 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2696 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2697 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel, clamp-to-edge
2701 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2702 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703 tci = _mm_madd_epi16(tci, tcoffset);
2704 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2705 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2706 _mm_setzero_si128());
2707 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2708 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2709 _mm_setzero_si128());
2710 fracm = _mm_srli_epi16(subtc, 1);
2711 pix1 = _mm_add_epi16(pix1,
2712 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2715 pix1 = _mm_add_epi16(pix1,
2716 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2717 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2718 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, repeat: corner coordinates are masked with size-1
2724 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2726 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2727 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2728 tci = _mm_madd_epi16(tci, tcoffset);
2729 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731 _mm_setzero_si128());
2732 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734 _mm_setzero_si128());
2735 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2736 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2737 tci = _mm_madd_epi16(tci, tcoffset);
2738 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740 _mm_setzero_si128());
2741 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743 _mm_setzero_si128());
2744 fracm = _mm_srli_epi16(subtc, 1);
2745 pix1 = _mm_add_epi16(pix1,
2746 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2747 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2748 pix3 = _mm_add_epi16(pix3,
2749 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2750 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2751 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2752 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2753 pix2 = _mm_add_epi16(pix2,
2754 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2755 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2756 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single trailing pixel, repeat
2760 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2761 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762 tci = _mm_madd_epi16(tci, tcoffset);
2763 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2764 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765 _mm_setzero_si128());
2766 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768 _mm_setzero_si128());
2769 fracm = _mm_srli_epi16(subtc, 1);
2770 pix1 = _mm_add_epi16(pix1,
2771 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2774 pix1 = _mm_add_epi16(pix1,
2775 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2776 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2777 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// nearest, clamp-to-edge: one texel per pixel, copied as a 32-bit word
2784 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2786 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2788 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2790 tci = _mm_madd_epi16(tci, tcoffset);
2791 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2796 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798 tci = _mm_madd_epi16(tci, tcoffset);
2799 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// nearest, repeat
2805 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2807 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2808 tci = _mm_and_si128(tci, tcmax);
2809 tci = _mm_madd_epi16(tci, tcoffset);
2810 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2815 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2816 tci = _mm_and_si128(tci, tcmax);
2817 tci = _mm_madd_epi16(tci, tcoffset);
2818 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2827 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2830 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to float components (`vector`) and returns a float; judging
// by the name it is meant to sample a shadow map.
// NOTE(review): the body is not visible in this excerpt, so nothing is claimed
// here about the value it returns.
2833 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA span buffer by an interpolated vertex attribute:
// for each pixel x in [span->startx, span->endx), out4f = in4f * c, where each
// component of c is (data + slope*x) * z.
// data and slope are passed to DPSOFTRAST_CALCATTRIB4F for attribute
// arrayindex and presumably filled in by it.
// NOTE(review): the declaration and assignment of z are not visible in this
// excerpt - presumably z = zf[x]; confirm against the full file.
2839 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2842 int startx = span->startx;
2843 int endx = span->endx;
2848 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2849 for (x = startx;x < endx;x++)
// evaluate the attribute at this pixel
2852 c[0] = (data[0] + slope[0]*x) * z;
2853 c[1] = (data[1] + slope[1]*x) * z;
2854 c[2] = (data[2] + slope[2]*x) * z;
2855 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input buffer by it
2856 out4f[x*4+0] = in4f[x*4+0] * c[0];
2857 out4f[x*4+1] = in4f[x*4+1] * c[1];
2858 out4f[x*4+2] = in4f[x*4+2] * c[2];
2859 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into a float RGBA span buffer:
// for each pixel x in [span->startx, span->endx), each component of out4f is
// (data + slope*x) * z.
// data and slope are passed to DPSOFTRAST_CALCATTRIB4F for attribute
// arrayindex and presumably filled in by it.
// NOTE(review): the declaration and assignment of z are not visible in this
// excerpt - presumably z = zf[x]; confirm against the full file.
2863 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2866 int startx = span->startx;
2867 int endx = span->endx;
2872 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2873 for (x = startx;x < endx;x++)
// evaluate the attribute at this pixel, then store it unchanged
2876 c[0] = (data[0] + slope[0]*x) * z;
2877 c[1] = (data[1] + slope[1]*x) * z;
2878 c[2] = (data[2] + slope[2]*x) * z;
2879 c[3] = (data[3] + slope[3]*x) * z;
2880 out4f[x*4+0] = c[0];
2881 out4f[x*4+1] = c[1];
2882 out4f[x*4+2] = c[2];
2883 out4f[x*4+3] = c[3];
2887 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2889 int x, startx = span->startx, endx = span->endx;
2890 float c[4], localcolor[4];
2891 localcolor[0] = subcolor[0];
2892 localcolor[1] = subcolor[1];
2893 localcolor[2] = subcolor[2];
2894 localcolor[3] = subcolor[3];
2895 for (x = startx;x < endx;x++)
2897 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2898 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2899 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2900 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2901 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2902 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2903 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2904 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2908 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2910 int x, startx = span->startx, endx = span->endx;
2911 for (x = startx;x < endx;x++)
2913 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2914 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2915 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2916 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2920 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2922 int x, startx = span->startx, endx = span->endx;
2923 for (x = startx;x < endx;x++)
2925 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2926 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2927 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2928 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: per-pixel weighted mix of two buffers, out4f = ina4f*a + inb4f*b,
// where a = 1 - (component 3 of the inb4f pixel).
//   span         - gives the pixel range [span->startx, span->endx)
//   out4f        - destination, 4 floats per pixel
//   ina4f, inb4f - the two inputs, 4 floats per pixel
// NOTE(review): the declaration of a and b and the statement that sets b are not
// visible in this excerpt; b is presumably inb4f's component 3, which would make
// this an alpha blend of B over A - confirm.
2932 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2934 int x, startx = span->startx, endx = span->endx;
2936 for (x = startx;x < endx;x++)
// weight for buffer A: one minus buffer B's fourth component
2938 a = 1.0f - inb4f[x*4+3];
2940 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2941 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2942 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2943 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2947 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2949 int x, startx = span->startx, endx = span->endx;
2950 float localcolor[4], ilerp, lerp;
2951 localcolor[0] = color[0];
2952 localcolor[1] = color[1];
2953 localcolor[2] = color[2];
2954 localcolor[3] = color[3];
2955 ilerp = 1.0f - localcolor[3];
2956 lerp = localcolor[3];
2957 for (x = startx;x < endx;x++)
2959 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2960 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2961 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2962 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2968 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2972 int startx = span->startx;
2973 int endx = span->endx;
2976 __m128i submod, substep, endsubmod;
2977 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2978 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2979 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2980 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2981 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2982 for (x = startx; x < endx;)
2984 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2985 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2986 if (nextsub >= endx)
2988 nextsub = endsub = endx-1;
2989 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2993 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2994 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2995 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2996 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2997 substep = _mm_packs_epi32(substep, substep);
2998 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3000 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3001 pix = _mm_mulhi_epu16(pix, submod);
3002 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3006 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3007 pix = _mm_mulhi_epu16(pix, submod);
3008 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3015 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3019 int startx = span->startx;
3020 int endx = span->endx;
3023 __m128i submod, substep, endsubmod;
3024 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3025 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3026 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3027 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3028 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3029 for (x = startx; x < endx;)
3031 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3032 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3033 if (nextsub >= endx)
3035 nextsub = endsub = endx-1;
3036 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3040 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3041 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3042 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3043 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3044 substep = _mm_packs_epi32(substep, substep);
3045 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3047 __m128i pix = _mm_srai_epi16(submod, 4);
3048 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3052 __m128i pix = _mm_srai_epi16(submod, 4);
3053 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine: out4ub = saturate(ina4ub + max(inb4ub - subcolor, 0)).
//   span           - gives the pixel range [span->startx, span->endx)
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - base buffer and bloom buffer, 4 bytes per pixel
//   subcolor       - 4 floats, scaled by 255 here before being subtracted
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3060 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3063 int x, startx = span->startx, endx = span->endx;
// subcolor * 255 as integers with lanes 0 and 2 swapped, then packed to 16 bits so
// both 4-lane halves hold the same color (one copy per pixel of a pair)
3064 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3065 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels (8 bytes) per pass
3066 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16 bits, subtract with unsigned saturation (floors at 0), add,
// and let the pack saturate the sum back to bytes
3068 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3069 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3070 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3071 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for a single pixel (4 bytes)
3075 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3076 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3077 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3078 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = ina4ub * inb4ub / 256, per byte.
//   span           - gives the pixel range [span->startx, span->endx)
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - the two factors, 4 bytes per pixel
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3083 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3086 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per pass
3087 for (x = startx;x+2 <= endx;x+=2)
// A's bytes go in the low half of each 16-bit lane, B's bytes in the high half
// (i.e. times 256), so the high 16 bits of the product are a*b/256
3089 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3090 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3091 pix1 = _mm_mulhi_epu16(pix1, pix2);
3092 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for a single pixel (4 bytes)
3096 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3097 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3098 pix1 = _mm_mulhi_epu16(pix1, pix2);
3099 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = saturate(ina4ub + inb4ub), per byte.
//   span           - gives the pixel range [span->startx, span->endx)
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - the two addends, 4 bytes per pixel
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3104 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3107 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per pass
3108 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16 bits, add, and let the pack clamp the sum to 255
3110 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3111 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3112 pix1 = _mm_add_epi16(pix1, pix2);
3113 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for a single pixel (4 bytes)
3117 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3118 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3119 pix1 = _mm_add_epi16(pix1, pix2);
3120 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: out4ub = saturate(ina4ub + inb4ub * tint), per byte.
//   span           - gives the pixel range [span->startx, span->endx)
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - base buffer and the buffer that is tinted then added
//   inbtintbgra    - 4 floats; used in the order given (no lane swap is applied)
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3125 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3128 int x, startx = span->startx, endx = span->endx;
// tint * 256 as 16-bit integers, duplicated into both 4-lane halves
3129 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3130 tint = _mm_packs_epi32(tint, tint);
// two pixels (8 bytes) per pass
3131 for (x = startx;x+2 <= endx;x+=2)
// B's bytes sit in the high half of each lane (times 256), so the high half of
// tint*B is (tint*256 * b*256) >> 16, i.e. b scaled by the float tint
3133 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3134 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3135 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3136 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for a single pixel (4 bytes)
3140 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3141 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3142 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3143 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: blends A toward B by B's own fourth byte,
// out4ub = ina4ub + (inb4ub - ina4ub) * inb_alpha / 256, per byte.
//   span           - gives the pixel range [span->startx, span->endx)
//   out4ub         - destination, 4 bytes per pixel
//   ina4ub, inb4ub - the two inputs, 4 bytes per pixel
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3148 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3151 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per pass
3152 for (x = startx;x+2 <= endx;x+=2)
3154 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3155 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (B's fourth byte) to all four lanes of that pixel
3156 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// signed multiply because (B - A) can be negative; both operands are shifted left
// by 4 so the high 16 bits of the product equal (B - A) * blend / 256
3157 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3158 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for a single pixel (4 bytes); only the low half needs the broadcast
3162 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3163 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3164 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3165 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3166 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op: blends every pixel toward one constant color by that color's
// fourth component, out4ub = in4ub + (color - in4ub) * color_alpha / 256.
//   span   - gives the pixel range [span->startx, span->endx)
//   out4ub - destination, 4 bytes per pixel
//   in4ub  - source, 4 bytes per pixel
//   color  - 4 floats, scaled by 255 here; lanes 0 and 2 are swapped before use
// NOTE(review): the guard in front of the single-pixel code at the end is not
// visible in this excerpt; presumably it runs when one odd pixel remains - confirm.
3171 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3174 int x, startx = span->startx, endx = span->endx;
// color * 255 as integers with lanes 0 and 2 swapped, packed so both halves match
3175 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3176 localcolor = _mm_packs_epi32(localcolor, localcolor);
// lane 3 (the blend weight) broadcast to every lane and pre-shifted left by 4
3177 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels (8 bytes) per pass
3178 for (x = startx;x+2 <= endx;x+=2)
3180 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// signed multiply because (color - pix) can be negative; with both operands
// shifted by 4 the high 16 bits of the product are (color - pix) * weight / 256
3181 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3182 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for a single pixel (4 bytes)
3186 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3187 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3188 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex shader for the generic mode: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform and
// loads the color and texcoord0 arrays; texcoord1 is loaded only when the SPECULAR
// permutation bit is set.
3195 void DPSOFTRAST_VertexShader_Generic(void)
3197 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3198 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3199 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texcoord set is only requested for the SPECULAR permutation
3200 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3201 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3204 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3207 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3211 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3214 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3215 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3217 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3218 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3221 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3223 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3226 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3228 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3231 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3236 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3237 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the post-process mode: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform and
// loads two texcoord arrays.
3242 void DPSOFTRAST_VertexShader_PostProcess(void)
3244 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3245 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// note the two different array ids: TEXCOORD1 is paired with TEXCOORD4 here
// (presumably output array first, input array second - confirm against Array_Load)
3246 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the post-process mode, working on one span in BGRA8:
// samples the first texture into the fragment buffer, optionally adds bloom from
// the second texture (BLOOM permutation), mixes in the ViewTintColor uniform and
// finishes the span. Saturation and gamma ramps are not implemented.
//   thread   - per-thread state (permutation bits and uniforms)
//   triangle - triangle state forwarded to the span helpers
//   span     - the span being shaded
3249 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3251 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3252 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3253 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3254 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3255 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, sampled with array index 2, goes straight into the fragment buffer
3256 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3257 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture, sampled with array index 3, is thresholded by the
// BloomColorSubtract uniform and added on top
3259 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3260 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// blend toward the view tint color in place
3262 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3263 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3265 // TODO: implement saturation
3267 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3269 // TODO: implement gammaramps
3271 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the depth/shadow mode: only the position array is processed,
// via DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform.
3276 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3278 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the depth/shadow mode: fills the span's fragment bytes with 0
// and finishes the span.
//   thread   - per-thread state forwarded to the span helpers
//   triangle - triangle state forwarded to the span helpers
//   span     - gives the pixel range [span->startx, span->endx)
3281 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 // this is never called (because colormask is off when this shader is used)
3284 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3285 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3286 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so the byte count is (endx - startx)*4
3287 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3288 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the flat-color mode: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform and
// texcoord0 goes through DPSOFTRAST_Array_Transform with the TexMatrix uniform.
3293 void DPSOFTRAST_VertexShader_FlatColor(void)
3295 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3296 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the flat-color mode: pixel = color texture * Color_Ambient uniform,
// with the fourth lane of the multiplier taken from the Alpha uniform instead.
//   thread   - per-thread state (permutation bits, blend mode, uniforms)
//   triangle - triangle state forwarded to the span helpers
//   span     - the span being shaded (pixel range, position, pixel mask)
// Output goes straight to the first color buffer of the framebuffer unless ALPHAKILL
// is set or the blend mode is not opaque; then it goes to a temporary buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines (declarations of color/pix/pix2, the branch
// bodies' braces and any skip/advance statements) are not visible in this excerpt.
3299 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3302 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span inside the framebuffer; pixel[x*4] then addresses span pixel x
3303 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3304 int x, startx = span->startx, endx = span->endx;
3305 __m128i Color_Ambientm;
3306 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3307 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3309 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3310 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs finishing
3311 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3312 pixel = buffer_FragColorbgra8;
// multiplier: Color_Ambient * 256 with lanes 0 and 2 swapped; lane 3 is cleared and
// replaced by Alpha * 255; finally packed to 16 bits, duplicated in both halves
3313 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3316 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3317 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and all 4 of their mask bytes equal 1
3320 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texel bytes are placed in the high half of each 16-bit lane, so the high half of
// the product is multiplier * texel / 256
3323 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3324 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3325 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3326 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3332 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3333 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3334 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the generic finish step
3336 if (pixel == buffer_FragColorbgra8)
3337 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the vertex-color mode: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform, the
// color array is loaded, and texcoord0 goes through DPSOFTRAST_Array_Transform with
// the TexMatrix uniform.
3343 void DPSOFTRAST_VertexShader_VertexColor(void)
3345 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3346 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3347 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the vertex-color mode:
// pixel = (Color_Diffuse * interpolated vertex color + Color_Ambient) * color texture,
// with the ambient term's fourth lane taken from the Alpha uniform and the diffuse
// term's fourth lane cleared.
//   thread   - per-thread state (permutation bits, blend mode, uniforms)
//   triangle - triangle state forwarded to the span helpers
//   span     - the span being shaded (pixel range, position, pixel mask)
// Output goes straight to the first color buffer of the framebuffer unless ALPHAKILL
// is set or the blend mode is not opaque; then it goes to a temporary buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines (declarations of data/slope/mod2/pix2, the branch
// bodies' braces and any skip/advance statements) are not visible in this excerpt.
3350 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3353 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span inside the framebuffer; pixel[x*4] then addresses span pixel x
3354 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3355 int x, startx = span->startx, endx = span->endx;
3356 __m128i Color_Ambientm, Color_Diffusem;
3358 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3359 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3360 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3361 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3362 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3363 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs finishing
3364 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3365 pixel = buffer_FragColorbgra8;
// ambient: uniform * 256, lanes 0 and 2 swapped, lane 3 replaced by Alpha * 255
3366 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3369 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: uniform * 4096, lanes 0 and 2 swapped, lane 3 cleared
3370 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3371 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3372 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color plane: same lane swap, moved to pixel startx and scaled by 4096 so
// that (diffuse*4096 * color*4096) >> 16 lands on the same 256 scale as the ambient
3373 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3374 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3375 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3376 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3377 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3378 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances by one slope step on every loop iteration
3379 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3381 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and all 4 of their mask bytes equal 1
3382 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z values are loaded at once; data is stepped three times inside this path so
// each of the four pixels gets its own vertex color
3385 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3386 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3387 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3388 data = _mm_add_ps(data, slope);
3389 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3390 data = _mm_add_ps(data, slope);
3391 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3392 data = _mm_add_ps(data, slope);
3393 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, two pixels per register
3394 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3395 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3396 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3397 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3398 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3404 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3405 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3406 mod = _mm_packs_epi32(mod, mod);
3407 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3408 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the generic finish step
3410 if (pixel == buffer_FragColorbgra8)
3411 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the lightmap mode: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrix uniform,
// texcoord0 goes through DPSOFTRAST_Array_Transform with the TexMatrix uniform, and
// texcoord4 is loaded (the pixel shader samples the lightmap with it).
3417 void DPSOFTRAST_VertexShader_Lightmap(void)
3419 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3420 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3421 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the lightmap mode:
// pixel = (Color_Diffuse * lightmap + Color_Ambient) * color texture,
// plus Color_Glow * glow texture when the GLOW permutation is set.
// The ambient term's fourth lane comes from the Alpha uniform; the diffuse and glow
// multipliers have their fourth lane cleared.
//   thread   - per-thread state (permutation bits, blend mode, uniforms)
//   triangle - triangle state forwarded to the span helpers
//   span     - the span being shaded (pixel range, position, pixel mask)
// Output goes straight to the first color buffer of the framebuffer unless ALPHAKILL
// is set or the blend mode is not opaque; then it goes to a temporary buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): several short lines (the declaration of pix2, braces, the else that
// presumably separates the glow loop from the plain loop, and any skip/advance
// statements) are not visible in this excerpt.
3424 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3427 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span inside the framebuffer; pixel[x*4] then addresses span pixel x
3428 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3429 int x, startx = span->startx, endx = span->endx;
3430 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3431 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3432 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3435 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3436 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord0, lightmap uses texcoord4
3437 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3438 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the span still needs finishing
3439 if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3440 pixel = buffer_FragColorbgra8;
// ambient: uniform * 256, lanes 0 and 2 swapped, lane 3 replaced by Alpha * 255
3441 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3442 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3443 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3444 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: uniform * 256, lanes 0 and 2 swapped, lane 3 cleared
3445 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3446 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3447 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3448 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture uses texcoord0; glow multiplier is built like the diffuse one
3450 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3451 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3452 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3453 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half ambient, high half glow: lets the single-pixel path do both products at once
3454 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
// loop used with glow
3455 for (x = startx;x < endx;x++)
3457 __m128i color, lightmap, glow, pix;
// fast path: at least 4 pixels remain and all 4 of their mask bytes equal 1
3458 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3461 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3462 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3463 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, two pixels per register
3464 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3465 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3466 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3467 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3468 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3469 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3470 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes (diffuse*lightmap + ambient) * color and
// the high half computes glowcolor * glow (lightmap's high half is zero), then the
// high half is added onto the low half
3476 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3477 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3478 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3479 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3480 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3481 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop used without glow (presumably the else branch of the GLOW test - confirm)
3486 for (x = startx;x < endx;x++)
3488 __m128i color, lightmap, pix;
// fast path: at least 4 pixels remain and all 4 of their mask bytes equal 1
3489 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3492 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3493 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * color, two pixels per register
3494 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3495 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3496 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3497 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3498 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3504 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3505 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3506 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3507 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the generic finish step
3510 if (pixel == buffer_FragColorbgra8)
3511 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap entry points that follow
// call these two stages before their definitions appear further down in the file.
3516 void DPSOFTRAST_VertexShader_LightDirection(void);
3517 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex stage: does no work of its own, it simply runs the shared
// LightDirection vertex stage.
3519 void DPSOFTRAST_VertexShader_FakeLight(void)
3521 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel stage: forwards its arguments unchanged to the shared LightDirection
// pixel stage, which tests thread->shader_mode against SHADERMODE_FAKELIGHT itself.
3524 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3526 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space LightDirectionMap vertex stage: runs the shared LightDirection vertex
// stage, then additionally loads TEXCOORD4, the coordinate set the pixel stage uses
// when sampling the lightmap and deluxemap textures.
3531 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3533 DPSOFTRAST_VertexShader_LightDirection();
3534 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space LightDirectionMap pixel stage: forwards its arguments unchanged to the
// shared LightDirection pixel stage, which handles
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE through its thread->shader_mode checks.
3537 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3539 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space LightDirectionMap vertex stage: runs the shared LightDirection vertex
// stage, then additionally loads TEXCOORD4, the coordinate set the pixel stage uses
// when sampling the lightmap and deluxemap textures.
3544 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3546 DPSOFTRAST_VertexShader_LightDirection();
3547 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space LightDirectionMap pixel stage: forwards its arguments unchanged to the
// shared LightDirection pixel stage, which handles
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE through its thread->shader_mode checks.
3550 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3552 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3557 void DPSOFTRAST_VertexShader_LightDirection(void)
3560 int numvertices = dpsoftrast.numvertices;
3562 float LightVector[4];
3563 float EyePosition[4];
3564 float EyeVectorModelSpace[4];
3570 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3571 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3572 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3573 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3574 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3575 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3576 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3577 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3578 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3579 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3580 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3581 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3582 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3583 for (i = 0;i < numvertices;i++)
3585 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3586 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3587 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3588 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3589 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3590 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3591 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3592 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3593 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3594 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3595 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3596 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3597 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3598 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3599 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3600 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3601 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3602 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3603 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3604 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3605 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3606 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3607 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3608 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3609 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3610 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3611 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3612 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3613 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3615 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar and 3-component vector helper macros.
// Min/Max pick the smaller/larger of two values; each argument appears more than once
// in the expansion, so arguments with side effects would be evaluated repeatedly.
3618 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3619 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product over elements [0], [1] and [2] of two indexable operands.
3620 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length (v dot v) and length (sqrt of the squared length).
3621 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3622 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that begins by measuring the length of v as sqrt(v dot v).
// NOTE(review): callers invoke it as a statement and then use v as a unit vector, so it
// presumably rescales v in place - confirm against the full macro body.
3623 #define DPSOFTRAST_Vector3Normalize(v)\
3626 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage shared by the LightDirection, FakeLight and LightDirectionMap
// (model space / tangent space) entry points.
//
// Samples the textures needed for the span into scratch buffers, shades every pixel x in
// [span->startx, span->endx) into buffer_FragColorbgra8 as BGRA bytes, and finally hands
// that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread->shader_permutation selects the lighting model:
//   SPECULAR      - ambient + (diffuse + specular) * light colour, optional glow
//   else DIFFUSE  - ambient + diffuse * light colour, optional glow
//   last loop     - ambient (and optional glow) only
// thread->shader_mode selects where the light direction and light colour come from
// (deluxemap/lightmap textures, the eye vector for FAKELIGHT, or the interpolated
// light vector from TEXCOORD5 plus the LightColor uniform).
//
// thread:   supplies uniform4f, shader_permutation, shader_mode and shader_exactspecularmath
// triangle: passed through to the span helpers and attribute interpolation
// span:     supplies startx/endx and is passed through to the span helpers
3637 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers; all of them are indexed directly by the pixel position x
// (one float, or four BGRA bytes, per pixel)
3639 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3640 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3641 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3642 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3643 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3644 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3645 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3646 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3647 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3648 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3649 int x, startx = span->startx, endx = span->endx;
3650 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolants set up by DPSOFTRAST_CALCATTRIB4F; the loops evaluate them as data + slope * x
3651 float LightVectordata[4];
3652 float LightVectorslope[4];
3653 float EyeVectordata[4];
3654 float EyeVectorslope[4];
3655 float VectorSdata[4];
3656 float VectorSslope[4];
3657 float VectorTdata[4];
3658 float VectorTslope[4];
3659 float VectorRdata[4];
3660 float VectorRslope[4];
3662 float diffusetex[4];
3664 float surfacenormal[4];
3665 float lightnormal[4];
3666 float lightnormal_modelspace[4];
3668 float specularnormal[4];
3671 float SpecularPower;
// colour uniforms are stored with components 0 and 2 swapped (uniform +0 -> [2], +2 -> [0]),
// presumably so that indices 0..2 line up with the B, G, R bytes of the *bgra8 buffers
3673 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3674 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3675 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3676 Color_Glow[3] = 0.0f;
3677 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3678 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3679 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component carries the Alpha uniform; it is the only factor applied
// to the texture alpha when the output alpha d[3] is computed
3680 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3681 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3682 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3683 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3684 Color_Pants[3] = 0.0f;
3685 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3686 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3687 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3688 Color_Shirt[3] = 0.0f;
// begin the span (receives buffer_z, which every sampling call below is also given),
// then sample the base colour texture through TEXCOORD0
3689 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3690 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants/shirt layers are only sampled when colormapping is enabled
3691 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3693 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3694 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// the glow layer is only sampled when the GLOW permutation is enabled
3696 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3698 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- specular path: diffuse + specular lighting ----
3700 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3702 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3703 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3704 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3705 Color_Diffuse[3] = 0.0f;
// default light colour from the uniform; the two light-direction-map modes overwrite
// components 0..2 per pixel from the lightmap, and FAKELIGHT overwrites them with 1.0
3706 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3707 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3708 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3709 LightColor[3] = 0.0f;
3710 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3711 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3712 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3713 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3714 Color_Specular[3] = 0.0f;
// pre-scaled by 1/255 because the exponent below multiplies it by the raw gloss alpha byte
3715 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3716 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3717 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// per-mode inputs: both light-direction-map modes sample lightmap + deluxemap through
// TEXCOORD4; model space additionally interpolates TEXCOORD1..3 as VectorS/T/R
3719 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3721 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3722 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3723 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3724 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3725 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3727 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3729 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3730 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3732 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3734 // nothing of this needed
// remaining modes: interpolate the light vector the vertex stage stored in TEXCOORD5
3738 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3741 for (x = startx;x < endx;x++)
// base colour as floats, still in raw byte units
3744 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3745 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3746 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3747 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the tinted pants/shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so the
// alpha component is left unchanged
3748 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3750 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3751 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3752 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3753 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3755 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3756 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3757 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3758 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// unpack the normal map: byte * (1/128) - 1, taking byte 2 as x and byte 0 as z
3759 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3760 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3761 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3762 DPSOFTRAST_Vector3Normalize(surfacenormal);
3764 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3766 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3767 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3768 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3769 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3771 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3772 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3773 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3774 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3776 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3777 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3778 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3779 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3781 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3782 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3783 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3784 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3786 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3787 DPSOFTRAST_Vector3Normalize(lightnormal);
3789 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes scaled by 1/256 and divided by lightnormal[2], with that divisor
// floored at 0.25
3791 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3792 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3793 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3794 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3797 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// the deluxemap is used as the light normal directly (no renormalization on this branch)
3799 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3800 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3801 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3803 float f = 1.0f / 256.0f;
3804 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3805 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3806 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3809 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// fake light: the normalized eye vector doubles as the light direction, light colour is 1.0
3811 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3812 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3813 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3814 DPSOFTRAST_Vector3Normalize(lightnormal);
3816 LightColor[0] = 1.0;
3817 LightColor[1] = 1.0;
3818 LightColor[2] = 1.0;
// remaining modes: interpolated light vector, normalized; LightColor stays at the uniform value
3822 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3823 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3824 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3825 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: surface normal dot light normal, negative values clamped to 0
3828 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3830 if(thread->shader_exactspecularmath)
3832 // reflect lightnormal at surfacenormal, take the negative of that
3833 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3835 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3836 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3837 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3838 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3840 // dot of this and normalize(EyeVectorFogDepth.xyz)
3841 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3842 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3843 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3844 DPSOFTRAST_Vector3Normalize(eyenormal);
3846 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// non-exact variant: normalize(lightnormal + eyenormal) dotted with the surface normal;
// note the result is floored at DPSOFTRAST_FLT_MIN here rather than at 0
3850 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3851 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3852 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3853 DPSOFTRAST_Vector3Normalize(eyenormal);
3855 specularnormal[0] = lightnormal[0] + eyenormal[0];
3856 specularnormal[1] = lightnormal[1] + eyenormal[1];
3857 specularnormal[2] = lightnormal[2] + eyenormal[2];
3858 DPSOFTRAST_Vector3Normalize(specularnormal);
3860 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < DPSOFTRAST_FLT_MIN) specular = DPSOFTRAST_FLT_MIN;
// exponent = SpecularPower (already divided by 255) * gloss alpha byte
3863 specular = pow(specular, SpecularPower * glosstex[3]);
// compose the channels: [glow +] ambient + (diffuse + specular) * light colour;
// only the upper bound (255) is clamped before the value is stored as a byte
3864 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3866 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3867 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3868 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3869 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3873 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3874 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3875 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3876 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3879 buffer_FragColorbgra8[x*4+0] = d[0];
3880 buffer_FragColorbgra8[x*4+1] = d[1];
3881 buffer_FragColorbgra8[x*4+2] = d[2];
3882 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- diffuse path: same light-direction handling as above, without the specular term ----
3885 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3887 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890 Color_Diffuse[3] = 0.0f;
3891 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3892 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3893 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3894 LightColor[3] = 0.0f;
3895 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3897 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3899 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3900 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3901 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3902 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3903 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3905 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3907 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3908 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// on this path the eye vector is only interpolated for FAKELIGHT, its sole consumer here
3910 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3912 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3916 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3919 for (x = startx;x < endx;x++)
// NOTE(review): unlike the specular path, the pants/shirt layers are not added to
// diffusetex on this path even when COLORMAPPING is set - confirm this is intended
3922 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3923 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3924 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3925 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3926 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3927 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3928 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3929 DPSOFTRAST_Vector3Normalize(surfacenormal);
3931 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3933 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3934 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3935 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3936 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3938 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3939 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3940 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3941 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3943 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3944 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3945 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3946 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3948 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3949 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3950 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3951 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3953 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3954 DPSOFTRAST_Vector3Normalize(lightnormal);
3956 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3958 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3959 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3960 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3961 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3964 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3966 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3967 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3968 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3970 float f = 1.0f / 256.0f;
3971 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3972 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3973 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3976 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3978 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3979 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3980 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3981 DPSOFTRAST_Vector3Normalize(lightnormal);
3983 LightColor[0] = 1.0;
3984 LightColor[1] = 1.0;
3985 LightColor[2] = 1.0;
3989 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3990 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3991 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3992 DPSOFTRAST_Vector3Normalize(lightnormal);
3995 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// compose the channels: [glow +] texture * (ambient + diffuse * light colour); upper clamp only
3996 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3998 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3999 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4000 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4001 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4005 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4006 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4007 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4008 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4010 buffer_FragColorbgra8[x*4+0] = d[0];
4011 buffer_FragColorbgra8[x*4+1] = d[1];
4012 buffer_FragColorbgra8[x*4+2] = d[2];
4013 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- ambient-only loop: texture * ambient, plus glow when enabled; no normal map or
// light direction is involved ----
4018 for (x = startx;x < endx;x++)
4021 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4022 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4023 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4024 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4026 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4028 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4029 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4030 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4031 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4035 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4036 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4037 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4038 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4040 buffer_FragColorbgra8[x*4+0] = d[0];
4041 buffer_FragColorbgra8[x*4+1] = d[1];
4042 buffer_FragColorbgra8[x*4+2] = d[2];
4043 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded BGRA span to the common finishing step
4046 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4051 void DPSOFTRAST_VertexShader_LightSource(void)
4054 int numvertices = dpsoftrast.numvertices;
4055 float LightPosition[4];
4056 float LightVector[4];
4057 float LightVectorModelSpace[4];
4058 float EyePosition[4];
4059 float EyeVectorModelSpace[4];
4065 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4066 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4067 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4068 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4069 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4070 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4071 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4072 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4073 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4074 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4075 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4076 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4077 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4078 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4079 for (i = 0;i < numvertices;i++)
4081 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4082 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4083 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4084 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4085 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4086 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4087 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4088 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4089 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4090 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4091 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4092 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4093 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4094 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4095 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4096 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4097 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4098 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4099 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4100 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4101 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4102 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4103 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4104 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4105 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4106 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4107 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4108 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4109 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4110 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4111 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4112 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4114 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4115 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// DPSOFTRAST_PixelShader_LightSource: shades one span for the light-source pass.
// Interpolates the light, eye and cube vectors across the span, samples the
// bound textures into per-span BGRA8 buffers, computes a lit color per pixel
// and passes the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f, shader_permutation and
//              shader_exactspecularmath
//   triangle - triangle being drawn (forwarded to the span helpers)
//   span     - span being drawn; startx/endx bound every per-pixel loop
// NOTE(review): z, attenuation, diffuse, specular, f, d, glosstex and eyenormal
// are used below without a visible declaration, and no braces are visible, so
// this listing appears to omit some lines of the function. The comments here
// describe only the statements that are shown.
4118 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers; the *bgra8 buffers are indexed as x*4+channel.
4121 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4122 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4123 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4124 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4125 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4126 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4128 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4129 int x, startx = span->startx, endx = span->endx;
4130 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// (data, slope) pairs for the attributes interpolated across the span.
4131 float CubeVectordata[4];
4132 float CubeVectorslope[4];
4133 float LightVectordata[4];
4134 float LightVectorslope[4];
4135 float EyeVectordata[4];
4136 float EyeVectorslope[4];
4138 float diffusetex[4];
4140 float surfacenormal[4];
4141 float lightnormal[4];
4143 float specularnormal[4];
4146 float SpecularPower;
4147 float CubeVector[4];
// Load the color uniforms. Components 0 and 2 are swapped on load, presumably
// so the local arrays share the B,G,R channel order of the *bgra8 buffers they
// are multiplied with below.
// Color_Glow is loaded here but is not referenced again in the visible lines.
4150 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4151 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4152 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4153 Color_Glow[3] = 0.0f;
4154 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4155 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4156 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the Alpha uniform
4157 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4158 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4159 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4160 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4161 Color_Diffuse[3] = 0.0f;
4162 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4163 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4164 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4165 Color_Specular[3] = 0.0f;
4166 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4167 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4168 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4169 Color_Pants[3] = 0.0f;
4170 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4171 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4172 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4173 Color_Shirt[3] = 0.0f;
4174 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4175 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4176 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4177 LightColor[3] = 0.0f;
// SpecularPower is pre-scaled by 1/255; it is later multiplied by glosstex[3],
// which is read from a byte buffer.
4178 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Set up the (data, slope) pairs; per pixel they are evaluated below as
// (data + slope*x) * z.
4179 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4180 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4181 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
// presumably fills buffer_z for this span - confirm against DPSOFTRAST_Draw_Span_Begin
4182 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4183 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// The base color texture is always sampled; pants/shirt and the cube filter
// are sampled only when their permutation bit is set.
4184 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4185 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4187 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4188 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4190 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4191 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Variant 1: diffuse + specular lighting ----
4192 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4194 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4195 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4196 for (x = startx;x < endx;x++)
// attenuation = 1 - |CubeVector|^2, using the interpolated TEXCOORD3 vector
4199 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4200 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4201 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4202 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not visible here; given
// the memset comment above, presumably the pixel is skipped - confirm.
4203 if (attenuation < 0.01f)
// optional shadow map term multiplies the attenuation, then the same test again
4205 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4207 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4208 if (attenuation < 0.01f)
// diffuse texel, kept as float in the byte range read from the buffer
4212 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4213 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4214 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4215 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping adds the pants and shirt texels, each tinted by its color uniform
4216 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4218 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4219 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4220 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4221 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4223 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4224 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4225 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4226 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte * (1/128) - 1 per channel, with buffer
// channels 2,1,0 mapped to vector components 0,1,2, then normalize
4227 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4228 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4229 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4230 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector (TEXCOORD1), normalized
4232 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4233 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4234 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4235 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal) clamped below at 0
4237 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4239 if(thread->shader_exactspecularmath)
4241 // reflect lightnormal at surfacenormal, take the negative of that
4242 // i.e. we want (2*dot(N, I) * N - I) for N=surfacenormal, I=lightnormal
4244 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4245 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4246 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4247 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4249 // dot of this and normalize(EyeVectorFogDepth.xyz)
4250 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4251 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4252 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4253 DPSOFTRAST_Vector3Normalize(eyenormal);
4255 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Second specular formulation (presumably the else branch of the test above;
// the `else` line is not visible here): specularnormal is the normalized sum
// of the light and eye directions, dotted with the surface normal.
4259 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4260 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4261 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4262 DPSOFTRAST_Vector3Normalize(eyenormal);
4264 specularnormal[0] = lightnormal[0] + eyenormal[0];
4265 specularnormal[1] = lightnormal[1] + eyenormal[1];
4266 specularnormal[2] = lightnormal[2] + eyenormal[2];
4267 DPSOFTRAST_Vector3Normalize(specularnormal);
// this path clamps to DPSOFTRAST_FLT_MIN rather than 0 before the pow below
4269 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < DPSOFTRAST_FLT_MIN) specular = DPSOFTRAST_FLT_MIN;
// exponent is the pre-scaled SpecularPower times the gloss texel's fourth channel
4271 specular = pow(specular, SpecularPower * glosstex[3]);
// Final color per channel:
//   (diffusetex * (ambient + diffuse color * diffuse) + glosstex * specular color * specular)
//   * LightColor [* cube texel] * attenuation
// Each channel is truncated to int and capped at 255; no lower clamp is visible.
4273 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4275 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4276 attenuation *= (1.0f / 255.0f);
4277 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4278 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4279 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4280 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube texel factor (presumably the else branch)
4284 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4285 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4286 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4287 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// store the pixel into the span's output buffer
4289 buffer_FragColorbgra8[x*4+0] = d[0];
4290 buffer_FragColorbgra8[x*4+1] = d[1];
4291 buffer_FragColorbgra8[x*4+2] = d[2];
4292 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 2: diffuse lighting only (no gloss texture, no specular term) ----
4295 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4297 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4298 for (x = startx;x < endx;x++)
// attenuation and shadow map handling as in variant 1
4301 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4302 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4303 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4304 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4305 if (attenuation < 0.01f)
4307 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4309 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4310 if (attenuation < 0.01f)
4314 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4315 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4316 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4317 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4318 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4320 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4321 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4322 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4323 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// normal map decode and diffuse term as in variant 1
4325 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4326 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4327 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4328 DPSOFTRAST_Vector3Normalize(surfacenormal);
4330 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4331 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4332 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4333 DPSOFTRAST_Vector3Normalize(lightnormal);
4335 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// color = diffusetex * (ambient + diffuse color * diffuse) * LightColor [* cube texel] * attenuation
4336 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4338 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4339 attenuation *= (1.0f / 255.0f);
4340 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4341 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4342 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4343 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4347 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4348 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4349 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4350 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4352 buffer_FragColorbgra8[x*4+0] = d[0];
4353 buffer_FragColorbgra8[x*4+1] = d[1];
4354 buffer_FragColorbgra8[x*4+2] = d[2];
4355 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 3: ambient term only ----
// (the keyword introducing this loop is not visible here; presumably a final else)
4360 for (x = startx;x < endx;x++)
4363 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4364 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4365 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4366 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4367 if (attenuation < 0.01f)
4369 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4371 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4372 if (attenuation < 0.01f)
4376 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4377 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4378 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4379 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4380 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4382 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4383 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4384 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4385 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// color = diffusetex * ambient * LightColor [* cube texel] * attenuation
4387 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4389 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4390 attenuation *= (1.0f / 255.0f);
4391 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4392 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4393 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4394 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4398 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4399 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4400 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4401 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4403 buffer_FragColorbgra8[x*4+0] = d[0];
4404 buffer_FragColorbgra8[x*4+1] = d[1];
4405 buffer_FragColorbgra8[x*4+2] = d[2];
4406 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine
4409 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// DPSOFTRAST_VertexShader_Refraction: vertex stage for the refraction pass.
// Takes no arguments; operates on the global dpsoftrast state.
// NOTE(review): the argument order of the array helpers appears to be
// (output array, input array, matrix) - confirm against their definitions.
4415 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD4 receives POSITION transformed by the ModelViewProjection matrix;
// the refraction pixel shader interpolates TEXCOORD4 as ModelViewProjectionPosition
4417 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is transformed by the texture matrix
4418 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// finally POSITION itself goes through TransformProject with the same ModelViewProjection matrix
4419 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// DPSOFTRAST_PixelShader_Refraction: shades one span for the refraction pass.
// For every pixel it derives a screen texture coordinate from the interpolated
// ModelViewProjection position, offsets it by the normalized normal-map vector,
// samples the refraction texture there and writes the texel scaled by
// RefractColor. Returns without drawing when no refraction texture is bound.
//   thread   - per-thread state; supplies texbound and uniform4f
//   triangle - triangle being drawn (forwarded to the span helpers)
//   span     - span being drawn; startx/endx bound the per-pixel loop
// NOTE(review): iw, v and c are used below without a visible declaration and no
// braces are visible, so this listing appears to omit some lines of the function.
4422 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4424 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4426 int x, startx = span->startx, endx = span->endx;
// per-span buffers, indexed as x*4+channel
4429 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4430 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// (data, slope) pair for the interpolated TEXCOORD4 attribute
4433 float ModelViewProjectionPositiondata[4];
4434 float ModelViewProjectionPositionslope[4];
// only the first two components of these uniforms are used by this shader
4437 float ScreenScaleRefractReflect[2];
4438 float ScreenCenterRefractReflect[2];
4439 float DistortScaleRefractReflect[2];
4440 float RefractColor[4];
// nothing to sample from if no refraction texture is bound
4442 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4443 if(!texture) return;
// sample the normal map for the whole span up front
4446 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4447 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4450 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
// load uniforms
4453 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4454 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4455 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4456 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4457 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4458 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// components 0 and 2 are swapped on load, presumably to match the BGRA order of
// the sampled texel c that RefractColor is multiplied with below
4459 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4460 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4461 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4462 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
// per-pixel shading; the quoted strings below are the GLSL lines each step mirrors
4465 for (x = startx;x < endx;x++)
4467 float SafeScreenTexCoord[2];
4468 float ScreenTexCoord[2];
4475 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// iw is the reciprocal of the interpolated fourth component
4476 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4478 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4479 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4480 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted GLSL below uses DistortScaleRefractReflect.zw, while
// the code uses components 0 and 1 - confirm which is intended.
4482 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map texel: byte * (1/128) - 1, buffer channels 2,1,0 -> v[0..2]
4483 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4484 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4485 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4486 DPSOFTRAST_Vector3Normalize(v);
4487 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4488 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4490 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4491 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// NOTE(review): the three color products are stored into unsigned char without
// a clamp - confirm RefractColor[0..2] stays within 0..1.
4493 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4494 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4495 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
// alpha is RefractColor[3] * 256, capped at 255
4496 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
// hand the shaded span to the common finish routine
4499 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// DPSOFTRAST_VertexShader_Water: vertex stage for the water pass.
// Takes no arguments; operates on the global dpsoftrast state. For each vertex
// it expresses (EyePosition - position) in the basis formed by the vectors read
// from TEXCOORD1/2/3 (named svector, tvector, normal) and stores the result in
// TEXCOORD6. Afterwards it transforms POSITION and TEXCOORD0.
// NOTE(review): i, position, svector, tvector, normal and EyeVector are used
// below without a visible declaration and no braces are visible, so this
// listing appears to omit some lines of the function.
4504 void DPSOFTRAST_VertexShader_Water(void)
4507 int numvertices = dpsoftrast.numvertices;
4508 float EyePosition[4];
4509 float EyeVectorModelSpace[4];
// EyePosition[3] is loaded but only components 0..2 are used in the visible lines
4515 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4516 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4517 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4518 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// load the arrays the loop below reads from dpsoftrast.post_array4f
4519 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4520 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4521 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4522 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4523 for (i = 0;i < numvertices;i++)
// gather this vertex's position and basis vectors (4 floats per vertex, first 3 used)
4525 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4526 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4527 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4528 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4529 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4530 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4531 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4532 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4533 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4534 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4535 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4536 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye
4537 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4538 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4539 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// project it onto svector, tvector and normal (three dot products)
4540 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4541 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4542 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// store in TEXCOORD6 with the fourth component set to 0
4543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4545 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): -1 is passed as the second argument here, presumably meaning
// "use the already-loaded POSITION data" - confirm against DPSOFTRAST_Array_TransformProject.
4548 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD4 receives POSITION transformed by the ModelViewProjection matrix;
// the water pixel shader interpolates it as ModelViewProjectionPosition
4549 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is transformed by the texture matrix
4550 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// DPSOFTRAST_PixelShader_Water: shades one span for the water pass.
// For every pixel it derives two screen texture coordinates (refraction in
// components 0/1, reflection in components 2/3), offsets both by the normalized
// normal-map vector, samples the refraction and reflection textures, and blends
// the two tinted texels with a Fresnel factor computed from the interpolated
// eye vector. Returns without drawing unless both textures are bound.
//   thread   - per-thread state; supplies texbound and uniform4f
//   triangle - triangle being drawn (forwarded to the span helpers)
//   span     - span being drawn; startx/endx bound the per-pixel loop
// NOTE(review): iw, v and Fresnel are used below without a visible declaration
// and no braces are visible, so this listing appears to omit some lines.
4554 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4556 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4558 int x, startx = span->startx, endx = span->endx;
// per-span buffers, indexed as x*4+channel
4561 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4562 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// (data, slope) pairs for the interpolated TEXCOORD4 and TEXCOORD6 attributes
4565 float ModelViewProjectionPositiondata[4];
4566 float ModelViewProjectionPositionslope[4];
4567 float EyeVectordata[4];
4568 float EyeVectorslope[4];
4571 float ScreenScaleRefractReflect[4];
4572 float ScreenCenterRefractReflect[4];
4573 float DistortScaleRefractReflect[4];
4574 float RefractColor[4];
4575 float ReflectColor[4];
4576 float ReflectFactor;
4577 float ReflectOffset;
// both textures are required; bail out if either is missing
4579 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4580 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4581 if(!texture_refraction || !texture_reflection) return;
// sample the normal map for the whole span up front
4584 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4585 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4588 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4589 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// load uniforms; all four components of the three screen uniforms are used here
4592 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4593 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4594 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4595 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4596 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4597 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4598 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4599 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4600 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4601 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4602 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4603 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
// color uniforms have components 0 and 2 swapped on load, presumably to match
// the BGRA order of the sampled texels they are multiplied with below
4604 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4605 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4606 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4607 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4608 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4609 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4610 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4611 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4612 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4613 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
// per-pixel shading; the quoted strings below are the GLSL lines each step mirrors
4616 for (x = startx;x < endx;x++)
4618 float SafeScreenTexCoord[4];
4619 float ScreenTexCoord[4];
// c1 = refraction texel, c2 = reflection texel
4622 unsigned char c1[4];
4623 unsigned char c2[4];
4628 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
// iw is the reciprocal of the interpolated fourth component
4629 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4631 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4632 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4633 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4634 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4635 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4637 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
// decode the normal map texel: byte * (1/128) - 1, buffer channels 2,1,0 -> v[0..2]
4638 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4639 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4640 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4641 DPSOFTRAST_Vector3Normalize(v);
// v[0], v[1] offset both coordinate pairs, each with its own distort scale
4642 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4643 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4644 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4645 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4647 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
// v is reused for the eye vector; it is not multiplied by z because it is
// normalized right after
4648 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4649 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4650 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4651 DPSOFTRAST_Vector3Normalize(v);
// Fresnel = min(1, 1 - v[2])^2 * ReflectFactor + ReflectOffset
4652 Fresnel = 1.0f - v[2];
4653 Fresnel = min(1.0f, Fresnel);
4654 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4656 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4657 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4658 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4659 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
// linear blend of the tinted refraction and reflection texels by Fresnel
// NOTE(review): the three color results are stored into unsigned char without
// a clamp - confirm the blended value stays within 0..255.
4661 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4662 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4663 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
// alpha is the blended alpha * 256, capped at 255
4664 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
// hand the shaded span to the common finish routine
4667 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION passed for both
// array arguments and the uniform4f block that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4672 void DPSOFTRAST_VertexShader_ShowDepth(void)
4674 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// As written it does not visualise depth: it calls DPSOFTRAST_Draw_Span_Begin with a local
// buffer_z, zero-fills the 4 bytes per pixel covering [span->startx, span->endx) in the local
// BGRA8 buffer and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): buffer_z is not read again in the visible lines - this looks like a
// placeholder implementation; confirm before relying on this mode.
4677 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4680 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4681 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4682 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span's pixel range are cleared
4683 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4684 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION passed for both
// array arguments and the uniform4f block that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1; no other arrays are touched here.
4689 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4691 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// The visible code writes no geometry data: it calls DPSOFTRAST_Draw_Span_Begin with a local
// buffer_z, zero-fills the BGRA8 bytes for pixels [span->startx, span->endx) and passes the
// buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation; confirm.
4694 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4697 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4698 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4699 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, cleared only inside the span's pixel range
4700 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4701 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION passed for both
// array arguments and the uniform4f block that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4706 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4708 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// The visible code performs no lighting: it calls DPSOFTRAST_Draw_Span_Begin with a local
// buffer_z, zero-fills the BGRA8 bytes for pixels [span->startx, span->endx) and passes the
// buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): looks like a placeholder implementation; confirm.
4711 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4714 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4715 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4716 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, cleared only inside the span's pixel range
4717 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4718 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; one entry per mode lives in DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table rows start with an integer and the draw code reads a member named
// lodarrayindex, but that member's declaration line is not visible in this listing - confirm.
4723 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point, called with no arguments when a draw is submitted
4726 void (*Vertex)(void);
// per-span pixel stage entry point
4727 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// array indices used by the mode; consumers compare each entry against DPSOFTRAST_ARRAY_TOTAL
// and the table writes ~0 as the out-of-range end marker
4728 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; consumers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, with ~0 again used as the end marker
4729 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4731 DPSOFTRAST_ShaderModeInfo;
// Dispatch table indexed by shader mode (see the [thread->shader_mode] / [dpsoftrast.shader_mode]
// lookups in the draw code). Each row is: leading integer (2 in every row), vertex function,
// span function, list of arrays, list of texture units; the lists end with ~0.
// NOTE(review): row order presumably follows the SHADERMODE_* enum, which is not visible here.
// NOTE(review): the Lightmap and VertexColor rows each appear twice with identical contents.
4733 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4735 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4736 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4737 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4738 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4739 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4740 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4741 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4742 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4743 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4744 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4745 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4746 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4747 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4748 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4749 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
// NOTE(review): the three rows below give no texunits initializer, so that array is
// zero-filled instead of starting with the ~0 end marker used by the Depth_Or_Shadow row;
// any consumer scanning texunits will see unit 0 in every slot for these modes - confirm
// whether {~0} was intended.
4750 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4751 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4752 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span.
// When depth testing is enabled and a depth buffer exists, each pixel x in [startx, endx) gets
// pixelmask[x] set from comparing the stored depth (depthpixel[x]) with the span's interpolated
// depth d = depthbase + depthslope*x, using thread->fb_depthfunc. Otherwise the whole range is
// marked visible. The mask pointer and the (possibly advanced) startx are stored back into span.
// NOTE(review): several declarations, the endx assignment and the bodies of the two trimming
// loops are not visible in this listing; the loops presumably advance startx / retreat endx.
4755 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4760 unsigned int *depthpixel;
4764 unsigned char *pixelmask;
4765 DPSOFTRAST_State_Triangle *triangle;
4766 triangle = &thread->triangles[span->triangle];
// depthpixel points at the framebuffer depth entry for (span->x, span->y); it is indexed by
// the span-relative x below
4767 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4768 startx = span->startx;
4770 depth = span->depthbase;
4771 depthslope = span->depthslope;
4772 pixelmask = thread->pixelmaskarray;
4773 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4775 switch(thread->fb_depthfunc)
// note the operand order in every case: the stored depth is on the left-hand side and the
// incoming depth d on the right
4778 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4779 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4780 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4781 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4782 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4783 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4784 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// scan past rejected pixels at the left end, then at the right end, of the range
4786 while (startx < endx && !pixelmask[startx])
4788 while (endx > startx && !pixelmask[endx-1])
4793 // no depth testing means we're just dealing with color...
4794 memset(pixelmask + startx, 1, endx - startx);
4796 span->pixelmask = pixelmask;
4797 span->startx = startx;
// Depth-buffer write-back for one span.
// Does nothing unless depth writes (thread->depthmask) and depth testing are both enabled and
// a depth buffer exists. Otherwise it walks x over the span range with the same interpolated
// depth d = depthbase + depthslope*x that the depth test uses.
// NOTE(review): the endx assignment and the loop body are not visible in this listing;
// presumably d is stored to depthpixel[x] where pixelmask[x] is set - confirm.
4801 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4803 int x, d, depth, depthslope, startx, endx;
4804 const unsigned char *pixelmask;
4805 unsigned int *depthpixel;
4806 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4808 depth = span->depthbase;
4809 depthslope = span->depthslope;
4810 pixelmask = span->pixelmask;
4811 startx = span->startx;
// same addressing as the depth test: base of the span in the depth buffer, indexed by
// span-relative x
4813 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4814 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Drains the thread's queued spans (thread->spans[0 .. numspans-1]).
// For each span: run the depth test, then - when colour buffer 0 exists and the colour mask is
// non-zero - call the active shader mode's Span function, then run the depth write. The queue
// length is reset to 0 afterwards.
// NOTE(review): the statement guarded by the "span->startx >= span->endx" check is not visible
// in this listing; presumably it skips spans the depth test emptied.
4820 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4823 DPSOFTRAST_State_Triangle *triangle;
4824 DPSOFTRAST_State_Span *span;
4825 for (i = 0; i < thread->numspans; i++)
4827 span = &thread->spans[i];
// spans refer to their triangle by index into thread->triangles
4828 triangle = &thread->triangles[span->triangle];
4829 DPSOFTRAST_Draw_DepthTest(thread, span);
4830 if (span->startx >= span->endx)
4832 // run pixel shader if appropriate
4833 // do this before running depthmask code, to allow the pixelshader
4834 // to clear pixelmask values for alpha testing
4835 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4836 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4837 DPSOFTRAST_Draw_DepthWrite(thread, span);
4839 thread->numspans = 0;
// Draw command record, declared through the DEFCOMMAND macro (defined elsewhere in this file)
// with number 22. Fields: datasize; the [starty, endy) row range of the draw; refcount, an
// atomic counter decremented once per interpreting thread; clipped; firstvertex / numvertices /
// numtriangles; arrays (vertex data block); element3i / element3s (32-bit / 16-bit index lists).
// NOTE(review): presumably this yields the DPSOFTRAST_Command_Draw type and
// DPSOFTRAST_OPCODE_Draw used below - confirm against the DEFCOMMAND definition.
4842 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Per-thread interpreter for one Draw command: converts the command's triangles into spans.
//
// Outline of the visible code:
//  - clamp the thread's two row bands (miny1..maxy1 and miny2..maxy2) to the scissor rectangle
//    and return early when the command's row range misses both;
//  - per triangle: fetch the three indices, clip against the near plane (clipdist = z + w),
//    producing three or four screen-space points, and reject triangles whose clamped bounding
//    box is empty or outside the bands;
//  - fit (x slope, y slope, origin) planes for w and for every attribute array listed by the
//    active shader mode, and pick a mip level per listed texture unit;
//  - walk the scanlines, emitting spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels, and
//    call DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up.
// Each exit path decrements command->refcount; the thread that brings it to zero frees
// command->arrays when the command size shows the data was allocated outside the command.
// NOTE(review): many short lines (declarations, braces, break/continue/return, the switch
// headers and some conditions) are not visible in this listing; comments below describe only
// what the visible lines establish.
4844 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4847 int cullface = thread->cullface;
4848 int minx, maxx, miny, maxy;
4849 int miny1, maxy1, miny2, maxy2;
4850 __m128i fbmin, fbmax;
4851 __m128 viewportcenter, viewportscale;
4852 int firstvertex = command->firstvertex;
4853 int numvertices = command->numvertices;
4854 int numtriangles = command->numtriangles;
4855 const int *element3i = command->element3i;
4856 const unsigned short *element3s = command->element3s;
4857 int clipped = command->clipped;
4864 int starty, endy, bandy;
4868 float clip0origin, clip0slope;
4870 __m128 triangleedge1, triangleedge2, trianglenormal;
4873 DPSOFTRAST_State_Triangle *triangle;
4874 DPSOFTRAST_Texture *texture;
4875 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, then the thread's two row bands clamped into it
4876 miny = thread->fb_scissor[1];
4877 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4878 miny1 = bound(miny, thread->miny1, maxy);
4879 maxy1 = bound(miny, thread->maxy1, maxy);
4880 miny2 = bound(miny, thread->miny2, maxy);
4881 maxy2 = bound(miny, thread->maxy2, maxy);
// the command touches neither band: drop this thread's reference; the last holder frees
// arrays that were allocated separately from the command buffer
4882 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4884 if (!ATOMIC_DECREMENT(command->refcount))
4886 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4887 MM_FREE(command->arrays);
4891 minx = thread->fb_scissor[0];
4892 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// packed 16-bit (x, y) clamp limits repeated four times; fbmax has 1 subtracted from the
// exclusive scissor/band maxima
4893 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4894 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4895 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4896 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4897 screen[3] = _mm_setzero_ps();
4898 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4899 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space float4s; the next float4 array
// (the one read as "arrays" below) follows immediately after
4901 const float *screencoord4f = command->arrays;
4902 const float *arrays = screencoord4f + numvertices*4;
4904 // generate the 3 edges of this triangle
4905 // generate spans for the triangle - switch based on left split or right split classification of triangle
// index fetch from the 16-bit list and from the 32-bit list, rebased by firstvertex
// (the condition selecting between the two is not visible in this listing)
4908 e[0] = element3s[i*3+0] - firstvertex;
4909 e[1] = element3s[i*3+1] - firstvertex;
4910 e[2] = element3s[i*3+2] - firstvertex;
4914 e[0] = element3i[i*3+0] - firstvertex;
4915 e[1] = element3i[i*3+1] - firstvertex;
4916 e[2] = element3i[i*3+2] - firstvertex;
4925 #define SKIPBACKFACE \
4926 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4927 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4928 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4929 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4930 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4934 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4938 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4943 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4944 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4946 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4947 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4949 #define CLIPPEDVERTEXCOPY(k,p1) \
4950 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4952 #define GENATTRIBCOPY(attrib, p1) \
4953 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4954 #define GENATTRIBLERP(attrib, p1, p2) \
4956 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4957 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4959 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4963 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4964 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4965 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4966 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4967 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4968 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4969 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4975 // calculate distance from nearplane
4976 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4977 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4978 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over the signs of the three distances: vertices with clipdist >= 0 are copied
// from the precomputed screen coordinates, crossings are interpolated and re-projected
4979 if (clipdist[0] >= 0.0f)
4981 if (clipdist[1] >= 0.0f)
4983 if (clipdist[2] >= 0.0f)
4986 // triangle is entirely in front of nearplane
4987 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4994 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5002 if (clipdist[2] >= 0.0f)
5004 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5011 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5018 else if (clipdist[1] >= 0.0f)
5020 if (clipdist[2] >= 0.0f)
5022 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5029 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5035 else if (clipdist[2] >= 0.0f)
5037 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5042 else continue; // triangle is entirely behind nearplane
5045 // calculate integer y coords for triangle points
5046 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5047 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5048 screenmin = _mm_min_epi16(screeni, screenir),
5049 screenmax = _mm_max_epi16(screeni, screenir);
5050 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5051 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5052 screenmin = _mm_max_epi16(screenmin, fbmin);
5053 screenmax = _mm_min_epi16(screenmax, fbmax);
5054 // skip offscreen triangles
5055 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range of the clamped bounding box; endy is exclusive
5057 starty = _mm_extract_epi16(screenmin, 1);
5058 endy = _mm_extract_epi16(screenmax, 1)+1;
5059 if (starty >= maxy1 && endy <= miny2)
5061 screeny = _mm_srai_epi32(screeni, 16);
5064 triangle = &thread->triangles[thread->numtriangles];
5066 // calculate attribute plans for triangle data...
5067 // okay, this triangle is going to produce spans, we'd better project
5068 // the interpolants now (this is what gives perspective texturing),
5069 // this consists of simply multiplying all arrays by the W coord
5070 // (which is basically 1/Z), which will be undone per-pixel
5071 // (multiplying by Z again) to get the perspective-correct array
5074 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5075 __m128 mipedgescale, mipdensity;
5076 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5077 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5078 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5079 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5080 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast lane 3 (w) of each screen point, then fit the plane for w across the triangle;
// it is stored as triangle->w = { x slope, y slope, origin }
5081 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5082 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5083 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5084 attribedge1 = _mm_sub_ss(w0, w1);
5085 attribedge2 = _mm_sub_ss(w2, w1);
5086 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5087 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5088 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5089 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5090 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5091 _mm_store_ss(&triangle->w[0], attribxslope);
5092 _mm_store_ss(&triangle->w[1], attribyslope);
5093 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: fit the same kind of plane to lane 2 of the screen points, combine it with
// fb_clipplane, and derive clip0origin / clip0slope / clip0dir, which the scanline loop turns
// into a per-row x boundary (clip0)
5098 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5100 float cliporigin, clipxslope, clipyslope;
5101 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5102 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5103 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5104 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5107 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5108 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5109 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5112 clip0origin = -cliporigin/clipxslope;
5113 clip0slope = -clipyslope/clipxslope;
5114 clip0dir = clipxslope > 0 ? 1 : -1;
5116 else if(clipyslope > 0)
5118 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5119 clip0slope = dpsoftrast.fb_width;
5122 else if(clipyslope < 0)
5124 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5125 clip0slope = -dpsoftrast.fb_width;
5128 else if(clip0origin < 0) continue;
5131 mipedgescale = _mm_setzero_ps();
// one plane per attribute array listed by the active shader mode; attribute values are
// multiplied by w before the plane is fitted, and "arrays" advances by one float4 array
// (numvertices*4 floats) per listed entry
5132 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5134 __m128 attrib0, attrib1, attrib2;
5135 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5136 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5138 arrays += numvertices*4;
5139 GENATTRIBS(attrib0, attrib1, attrib2);
5140 attriborigin = _mm_mul_ps(attrib1, w1);
5141 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5142 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5143 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5144 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5145 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5146 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5147 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5148 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex also provides mipedgescale: attribute deltas along the two
// triangle edges scaled by the reciprocal screen-space edge lengths
5149 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5151 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5152 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5153 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5154 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip selection: defaults to 0 for every unit; a level is computed only for listed units whose
// bound texture has a filter above DPSOFTRAST_TEXTURE_FILTER_LINEAR
5158 memset(triangle->mip, 0, sizeof(triangle->mip));
5159 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5161 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5162 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5164 texture = thread->texbound[texunit];
5165 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5167 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5168 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5169 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5170 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5171 // this will be multiplied in the texturing routine by the texture resolution
5172 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, clamped to the last mip level
5175 y = (int)(log((float)y)*0.5f/M_LN2);
5176 if (y > texture->mipmaps - 1)
5177 y = texture->mipmaps - 1;
5178 triangle->mip[texunit] = y;
// scanline walk over the first band, then the second (bandy / y are re-seeded from maxy2 /
// miny2 by the loop's step expression). ycc marks the points whose integer y is below the
// current row; the case tables map that byte mask to the two edges (p = start point,
// n = end point) bounding the row, first for four-point polygons, then for three-point ones.
// NOTE(review): the switch headers and the condition choosing between the two tables are not
// visible in this listing.
5184 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5187 __m128 xcoords, xslope;
5188 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5189 int yccmask = _mm_movemask_epi8(ycc);
5190 int edge0p, edge0n, edge1p, edge1n;
5199 case 0xFFFF: /*0000*/ y = endy; continue;
5200 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5201 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5202 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5203 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5204 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5205 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5206 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5207 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5208 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5209 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5210 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5211 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5212 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5213 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5214 case 0x0000: /*1111*/ y++; continue;
5222 case 0xFFFF: /*000*/ y = endy; continue;
5223 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5224 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5225 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5226 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5227 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5228 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5229 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be walked with the current edge pair, capped to the band
5232 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5233 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5234 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5235 nexty = _mm_extract_epi16(ycc, 0);
5236 if (nexty >= bandy) nexty = bandy-1;
// per-row x step of both edges (dx/dy) and their x positions at row y, plus 0.5
5237 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5238 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5239 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5240 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5241 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half: swap the two edges when edge 0 is to the right
5242 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5244 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5245 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5247 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5248 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5250 int startx, endx, offset;
5251 startx = _mm_cvtss_si32(xcoords);
5252 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row's pixel range to the horizontal scissor, then to the clip boundary
5253 if (startx < minx) startx = minx;
5254 if (endx > maxx) endx = maxx;
5255 if (startx >= endx) continue;
5263 if(endx <= clip0) continue;
5264 startx = (int)clip0;
5267 else if (endx > clip0)
5269 if(startx >= clip0) continue;
// cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5274 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5276 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5277 span->triangle = thread->numtriangles;
5281 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5282 if (span->startx >= span->endx)
// span depth is the w plane evaluated at (span->x, span->y) and scaled to integers;
// the polygon offset terms are subtracted from the base value
5284 wslope = triangle->w[0];
5285 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5286 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5287 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5288 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5289 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: spans reference triangles by index, so flush before reusing slots
5294 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5296 DPSOFTRAST_Draw_ProcessSpans(thread);
5297 thread->numtriangles = 0;
// done with the command: same reference drop and deferred free as in the early-out path
5301 if (!ATOMIC_DECREMENT(command->refcount))
5303 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5304 MM_FREE(command->arrays);
// flush whatever is still queued for this thread
5307 if (thread->numspans > 0 || thread->numtriangles > 0)
5309 DPSOFTRAST_Draw_ProcessSpans(thread);
5310 thread->numtriangles = 0;
// Allocates a Draw command together with its vertex/index data block and wires up the global
// post-transform array pointers.
//
// Data layout (in order): numvertices float4 screen coordinates, numvertices float4 positions,
// one numvertices float4 array per array listed by the current shader mode, then the copied
// index list. If command header plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data
// is allocated separately with MM_CALLOC and only the header goes into the command buffer;
// otherwise the data sits directly behind the header inside the command.
// NOTE(review): the MM_CALLOC result is not checked in the visible lines.
// NOTE(review): the conditions selecting the element3s / element3i paths, the loop exits and
// the return statement are not visible in this listing; presumably the command is returned.
5315 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5319 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always present: screen coordinates and positions
5320 int datasize = 2*numvertices*sizeof(float[4]);
5321 DPSOFTRAST_Command_Draw *command;
5322 unsigned char *data;
// plus one float4 array for each array the current shader mode lists
5323 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5325 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5326 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5328 datasize += numvertices*sizeof(float[4]);
// plus the index list, sized for 16-bit or 32-bit indices
5331 datasize += numtriangles*sizeof(unsigned short[3]);
5333 datasize += numtriangles*sizeof(int[3]);
5334 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5335 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
// too large for the command buffer: header only in the buffer, data in a zeroed allocation
5337 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5338 data = (unsigned char *)MM_CALLOC(datasize, 1);
// small enough: data is placed inline right after the aligned header
5342 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5343 data = (unsigned char *)command + commandsize;
5345 command->firstvertex = firstvertex;
5346 command->numvertices = numvertices;
5347 command->numtriangles = numtriangles;
5348 command->arrays = (float *)data;
// reset all post-transform array pointers, then point the used ones into the data block
5349 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5350 dpsoftrast.firstvertex = firstvertex;
5351 dpsoftrast.numvertices = numvertices;
5352 dpsoftrast.screencoord4f = (float *)data;
5353 data += numvertices*sizeof(float[4]);
5354 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5355 data += numvertices*sizeof(float[4]);
5356 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5358 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5359 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5361 dpsoftrast.post_array4f[j] = (float *)data;
5362 data += numvertices*sizeof(float[4]);
// the index list is copied behind the vertex arrays, so the command owns its own copy
5364 command->element3i = NULL;
5365 command->element3s = NULL;
5368 command->element3s = (unsigned short *)data;
5369 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5373 command->element3i = (int *)data;
5374 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point for drawing indexed triangles.
// Allocates the Draw command, runs the current shader mode's vertex function, clamps the
// draw's row range (dpsoftrast.drawstarty / drawendy) to [0, fb_height], and either discards
// the command when that range is empty or publishes it to the worker threads.
// Parameters: firstvertex / numvertices describe the vertex range, numtriangles the number of
// index triples, element3i / element3s the 32-bit / 16-bit index lists.
// NOTE(review): the return after the discard path and the else branch header are not visible
// in this listing; DPSOFTRAST_Draw_FlushThreads is presumably the non-threaded path.
5379 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5381 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5382 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5383 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5384 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing to draw: release separately allocated data and take the command back out of the
// command buffer
5385 if (command->starty >= command->endy)
5387 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5388 MM_FREE(command->arrays);
5389 DPSOFTRAST_UndoCommand(command->commandsize);
5392 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each interpreting thread decrements it once
5393 command->refcount = dpsoftrast.numthreads;
5395 if (dpsoftrast.usethreads)
5398 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row bands overlap the draw's row range
5399 for (i = 0; i < dpsoftrast.numthreads; i++)
5401 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5402 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5403 Thread_CondSignal(thread->drawcond);
5408 DPSOFTRAST_Draw_FlushThreads();
// Declares the SetRenderTargets command with two payload fields, width and
// height (the leading 23 is presumably its opcode number -- DEFCOMMAND is
// defined elsewhere).
5412 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Per-thread handler for SetRenderTargets: sets the DPSOFTRAST_VALIDATE_FB bit
// in thread->validate. The command payload is not read in the visible code.
5413 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5415 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Stores new framebuffer dimensions and depth/color target pointers in
// dpsoftrast, recomputes the viewport center/scale, and queues a
// SetRenderTargets command carrying the new width and height.
//   width, height            - framebuffer dimensions
//   depthpixels              - depth target
//   colorpixels0..3          - the four color targets
5417 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5419 DPSOFTRAST_Command_SetRenderTargets *command;
// True when any requested value differs from what dpsoftrast currently holds.
// NOTE(review): the statement controlled by this test is not visible here
// (exactly one line is missing between the condition and the assignments);
// presumably pending work is flushed before the targets change -- confirm.
5420 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5421 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5422 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5424 dpsoftrast.fb_width = width;
5425 dpsoftrast.fb_height = height;
5426 dpsoftrast.fb_depthpixels = depthpixels;
5427 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5428 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5429 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5430 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// The viewport center/scale are recomputed after the framebuffer size is stored.
5431 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5432 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5433 command->width = width;
5434 command->height = height;
// Executes queued commands on behalf of one thread, starting at
// thread->commandoffset and continuing until the offset equals endoffset.
// The offset reached is written back to thread->commandoffset.
//   thread    - per-thread state passed to every command handler
//   endoffset - command pool offset at which interpretation stops
5437 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5439 int commandoffset = thread->commandoffset;
5440 while (commandoffset != endoffset)
5442 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5443 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call its
// DPSOFTRAST_Interpret_<name> handler, step past the aligned command struct,
// and wrap the offset to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
// NOTE(review): the macro's final line is not visible in this listing.
5445 #define INTERPCOMMAND(name) \
5446 case DPSOFTRAST_OPCODE_##name : \
5447 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5448 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5449 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5450 commandoffset = 0; \
5452 INTERPCOMMAND(Viewport)
5453 INTERPCOMMAND(ClearColor)
5454 INTERPCOMMAND(ClearDepth)
5455 INTERPCOMMAND(ColorMask)
5456 INTERPCOMMAND(DepthTest)
5457 INTERPCOMMAND(ScissorTest)
5458 INTERPCOMMAND(Scissor)
5459 INTERPCOMMAND(BlendFunc)
5460 INTERPCOMMAND(BlendSubtract)
5461 INTERPCOMMAND(DepthMask)
5462 INTERPCOMMAND(DepthFunc)
5463 INTERPCOMMAND(DepthRange)
5464 INTERPCOMMAND(PolygonOffset)
5465 INTERPCOMMAND(CullFace)
5466 INTERPCOMMAND(SetTexture)
5467 INTERPCOMMAND(SetShader)
5468 INTERPCOMMAND(Uniform4f)
5469 INTERPCOMMAND(UniformMatrix4f)
5470 INTERPCOMMAND(Uniform1i)
5471 INTERPCOMMAND(SetRenderTargets)
5472 INTERPCOMMAND(ClipPlane)
// Draw is handled by hand because its size varies: the offset advances by the
// size recorded in the command itself, and thread->commandoffset is updated
// right after the draw rather than only at the end of the loop.
5474 case DPSOFTRAST_OPCODE_Draw:
5475 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5476 commandoffset += command->commandsize;
5477 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5479 thread->commandoffset = commandoffset;
// NOTE(review): the statements handling Reset are not visible in this listing.
5482 case DPSOFTRAST_OPCODE_Reset:
5487 thread->commandoffset = commandoffset;
// Thread entry point passed to Thread_CreateThread(); data is the thread's
// DPSOFTRAST_State_Thread. The loop runs for as long as thread->index >= 0.
// NOTE(review): brace/else lines and the return statement are missing from
// this listing; the branch structure described below is inferred -- confirm.
5490 static int DPSOFTRAST_Draw_Thread(void *data)
5492 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5493 while(thread->index >= 0)
// Commands are pending for this thread: interpret up to dpsoftrast.drawcommand.
5495 if (thread->commandoffset != dpsoftrast.drawcommand)
5497 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Presumably the idle path: the "nothing pending and still running" test is
// repeated with drawmutex held before the thread goes to sleep.
5501 Thread_LockMutex(thread->drawmutex);
5502 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Signal waitcond if thread->waiting is set, then flag this thread as
// starving for the duration of the wait on drawcond.
5504 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5505 thread->starving = true;
5506 Thread_CondWait(thread->drawcond, thread->drawmutex);
5507 thread->starving = false;
5509 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand and then
// resets commandpool.usedcommands to 0.
// In threaded mode this is done by waking the threads and waiting for them;
// the final loop interprets the outstanding commands directly for each thread
// state (presumably the non-threaded branch -- brace/else lines and the
// declaration of `i` are not visible in this listing, confirm).
5515 static void DPSOFTRAST_Draw_FlushThreads(void)
5517 DPSOFTRAST_State_Thread *thread;
5519 DPSOFTRAST_Draw_SyncCommands();
5520 if (dpsoftrast.usethreads)
// Pass 1: signal drawcond for every thread that still has commands to run
// and is flagged as starving. The test is repeated with drawmutex held.
5522 for (i = 0; i < dpsoftrast.numthreads; i++)
5524 thread = &dpsoftrast.threads[i];
5525 if (thread->commandoffset != dpsoftrast.drawcommand)
5527 Thread_LockMutex(thread->drawmutex);
5528 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5529 Thread_CondSignal(thread->drawcond);
5530 Thread_UnlockMutex(thread->drawmutex);
// Pass 2: block on waitcond until each thread has caught up.
5533 for (i = 0; i < dpsoftrast.numthreads; i++)
5535 thread = &dpsoftrast.threads[i];
5536 if (thread->commandoffset != dpsoftrast.drawcommand)
5538 Thread_LockMutex(thread->drawmutex);
// The predicate is re-tested after every wakeup: a condition wait may return
// without a matching signal, and leaving early would let usedcommands be
// reset while the thread is still executing commands from the pool.
5539 while (thread->commandoffset != dpsoftrast.drawcommand)
5541 thread->waiting = true;
5542 Thread_CondWait(thread->waitcond, thread->drawmutex);
5543 thread->waiting = false;
5545 Thread_UnlockMutex(thread->drawmutex);
// Direct interpretation of whatever each thread state has not executed yet.
5551 for (i = 0; i < dpsoftrast.numthreads; i++)
5553 thread = &dpsoftrast.threads[i];
5554 if (thread->commandoffset != dpsoftrast.drawcommand)
5555 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// All threads are caught up, so the pool is marked as empty.
5558 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
5561 void DPSOFTRAST_Flush(void)
5563 DPSOFTRAST_Draw_FlushThreads();
// NOTE(review): only the signature of DPSOFTRAST_Finish is visible in this
// listing; its body (three lines, per the embedded numbering) is missing, so
// its behavior is not documented here -- confirm against the full file.
5566 void DPSOFTRAST_Finish(void)
// Resets the global dpsoftrast state and sets up the per-thread states.
//   width, height - framebuffer dimensions; also the initial viewport and
//                   scissor size
//   numthreads    - requested thread count; threads are used only when this
//                   is > 0 and Thread_HasThreads() is true, in which case the
//                   count goes through bound(1, numthreads, 64); otherwise a
//                   single thread state is set up
//   interlace     - goes through bound(0, interlace, 1) when threads are
//                   used, otherwise 0
//   colorpixels   - stored as color target 0
//   depthpixels   - stored as the depth target
// NOTE(review): the return statement and the declarations of `i` and `u` are
// not visible in this listing, so the return value is not documented here.
5571 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5581 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5582 dpsoftrast.bigendian = u.b[3];
5583 dpsoftrast.fb_width = width;
5584 dpsoftrast.fb_height = height;
5585 dpsoftrast.fb_depthpixels = depthpixels;
// Only color target 0 is supplied by the caller; targets 1..3 start out NULL.
5586 dpsoftrast.fb_colorpixels[0] = colorpixels;
5587 dpsoftrast.fb_colorpixels[1] = NULL;
5588 dpsoftrast.fb_colorpixels[2] = NULL;
5589 dpsoftrast.fb_colorpixels[3] = NULL;
// The initial viewport covers the whole framebuffer.
5590 dpsoftrast.viewport[0] = 0;
5591 dpsoftrast.viewport[1] = 0;
5592 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5593 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5594 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5595 dpsoftrast.texture_firstfree = 1;
5596 dpsoftrast.texture_end = 1;
5597 dpsoftrast.texture_max = 0;
5598 dpsoftrast.color[0] = 1;
5599 dpsoftrast.color[1] = 1;
5600 dpsoftrast.color[2] = 1;
5601 dpsoftrast.color[3] = 1;
5602 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5603 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5604 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the MM_CALLOC result is used below without a NULL check.
5605 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// Give every thread state the same initial render state.
5606 for (i = 0; i < dpsoftrast.numthreads; i++)
5608 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5610 thread->cullface = GL_BACK;
5611 thread->colormask[0] = 1;
5612 thread->colormask[1] = 1;
5613 thread->colormask[2] = 1;
5614 thread->colormask[3] = 1;
5615 thread->blendfunc[0] = GL_ONE;
5616 thread->blendfunc[1] = GL_ZERO;
5617 thread->depthmask = true;
5618 thread->depthtest = true;
5619 thread->depthfunc = GL_LEQUAL;
5620 thread->scissortest = false;
// Per-thread viewport and scissor rectangles both span the full framebuffer.
5621 thread->viewport[0] = 0;
5622 thread->viewport[1] = 0;
5623 thread->viewport[2] = dpsoftrast.fb_width;
5624 thread->viewport[3] = dpsoftrast.fb_height;
5625 thread->scissor[0] = 0;
5626 thread->scissor[1] = 0;
5627 thread->scissor[2] = dpsoftrast.fb_width;
5628 thread->scissor[3] = dpsoftrast.fb_height;
5629 thread->depthrange[0] = 0;
5630 thread->depthrange[1] = 1;
5631 thread->polygonoffset[0] = 0;
5632 thread->polygonoffset[1] = 0;
5633 thread->clipplane[0] = 0;
5634 thread->clipplane[1] = 0;
5635 thread->clipplane[2] = 0;
5636 thread->clipplane[3] = 1;
5638 thread->numspans = 0;
5639 thread->numtriangles = 0;
5640 thread->commandoffset = 0;
5641 thread->waiting = false;
5642 thread->starving = false;
// Set every validate bit and run validation once with the same mask.
5644 thread->validate = -1;
5645 DPSOFTRAST_Validate(thread, -1);
// Threaded mode only: create the two condition variables, the mutex, and the
// thread that runs DPSOFTRAST_Draw_Thread with this state as its argument.
5647 if (dpsoftrast.usethreads)
5649 thread->waitcond = Thread_CreateCond();
5650 thread->drawcond = Thread_CreateCond();
5651 thread->drawmutex = Thread_CreateMutex();
5652 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5658 void DPSOFTRAST_Shutdown(void)
5661 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5663 DPSOFTRAST_State_Thread *thread;
5664 for (i = 0; i < dpsoftrast.numthreads; i++)
5666 thread = &dpsoftrast.threads[i];
5667 Thread_LockMutex(thread->drawmutex);
5669 Thread_CondSignal(thread->drawcond);
5670 Thread_UnlockMutex(thread->drawmutex);
5671 Thread_WaitThread(thread->thread, 0);
5672 Thread_DestroyCond(thread->waitcond);
5673 Thread_DestroyCond(thread->drawcond);
5674 Thread_DestroyMutex(thread->drawmutex);
5677 for (i = 0;i < dpsoftrast.texture_end;i++)
5678 if (dpsoftrast.texture[i].bytes)
5679 MM_FREE(dpsoftrast.texture[i].bytes);
5680 if (dpsoftrast.texture)
5681 free(dpsoftrast.texture);
5682 if (dpsoftrast.threads)
5683 MM_FREE(dpsoftrast.threads);
5684 memset(&dpsoftrast, 0, sizeof(dpsoftrast));