3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
// Lets this file spell the engine's qboolean type as bool.
14 typedef qboolean bool;
// Byte alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC further down; the ATOMIC() macros below hard-code the same value.
18 #define ATOMIC_SIZE 32
// Per-compiler definitions of the alignment and atomic-counter helpers:
//   ALIGN(var) declares var with 16-byte alignment, ATOMIC(var) with 32-byte alignment;
//   MEMORY_BARRIER is _mm_sfence() in every compiler branch (a store fence only);
//   ATOMIC_COUNTER is the counter type, ATOMIC_INCREMENT/DECREMENT/ADD operate on it.
// Apple: counters use the libkern OSAtomic*Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters use the __sync_* builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
// disabled alternative barrier (full barrier builtin):
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment, counters use the Interlocked* functions on a LONG.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, a no-op barrier, and plain
// (non-atomic) arithmetic on an int counter.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// SSE2 intrinsics (_mm_* functions, _mm_malloc/_mm_free are used below).
74 #include <emmintrin.h>
// Supplies _mm_cvtss_f32 through a GCC builtin for GCC versions assumed to lack it.
// NOTE(review): `__GNUC` is missing its trailing underscores. An undefined
// identifier evaluates to 0 inside #if, so `__GNUC < 4` is always true and this
// define is applied on every non-clang GCC. `__GNUC_MINOR__ < 6` is also not
// tied to major version 4 (GCC 5.0-5.5 would match). The intended test is
// presumably (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) - confirm.
76 #if defined(__GNUC__) && (__GNUC < 4 || __GNUC_MINOR__ < 6) && !defined(__clang__)
77 #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
// Allocates size bytes aligned to ATOMIC_SIZE.
80 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like wrapper over _mm_malloc: allocates nmemb*size bytes aligned to
// ATOMIC_SIZE and zero-fills them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow, unlike calloc().
82 static void *MM_CALLOC(size_t nmemb, size_t size)
84 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
85 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from MM_MALLOC/MM_CALLOC above.
89 #define MM_FREE _mm_free
// Alternative definitions mapping straight to the C allocator (presumably the
// non-SSE build; the selecting directive is not shown here - confirm).
91 #define MM_MALLOC(size) malloc(size)
92 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color, eight texcoord
// sets). DPSOFTRAST_ARRAY_TOTAL is the entry count and is used to size
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
96 typedef enum DPSOFTRAST_ARRAY_e
98 DPSOFTRAST_ARRAY_POSITION,
99 DPSOFTRAST_ARRAY_COLOR,
100 DPSOFTRAST_ARRAY_TEXCOORD0,
101 DPSOFTRAST_ARRAY_TEXCOORD1,
102 DPSOFTRAST_ARRAY_TEXCOORD2,
103 DPSOFTRAST_ARRAY_TEXCOORD3,
104 DPSOFTRAST_ARRAY_TEXCOORD4,
105 DPSOFTRAST_ARRAY_TEXCOORD5,
106 DPSOFTRAST_ARRAY_TEXCOORD6,
107 DPSOFTRAST_ARRAY_TEXCOORD7,
108 DPSOFTRAST_ARRAY_TOTAL
// One texture object. A slot whose bytes pointer is NULL is treated as free
// by DPSOFTRAST_Texture_GetByIndex / DPSOFTRAST_Texture_New.
112 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
119 DPSOFTRAST_TEXTURE_FILTER filter;
// counter using the platform's atomic counter type (its users are not in this part of the file)
122 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated with MM_CALLOC
123 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
124 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are padded to multiples of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and declared with the ALIGN() alignment.
128 #define COMMAND_SIZE ALIGN_SIZE
129 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command record in the command pool.
131 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
// DPSOFTRAST_OPCODE_* value identifying the command type
133 unsigned char opcode;
// size of the record in bytes as passed to DPSOFTRAST_AllocateCommand
134 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the tail of the pool
// is too small for the next command.
138 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the constant DPSOFTRAST_OPCODE_<name> = opcodeval
//   - the aligned struct DPSOFTRAST_Command_<name>, which starts with the same
//     opcode/commandsize header as DPSOFTRAST_Command
//     (presumably followed by `fields` - that line is not shown here).
140 #define DEFCOMMAND(opcodeval, name, fields) \
141 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
142 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
144 unsigned char opcode; \
145 unsigned short commandsize; \
147 } DPSOFTRAST_Command_##name );
// Size in bytes of the command ring buffer (2 MiB).
149 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// Presumably the largest single command allowed, in bytes - not referenced in this part of the file; confirm.
150 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command ring buffer; freecommand/usedcommands (used by the allocator below)
// are members declared on lines not shown here.
152 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command storage, 32-byte aligned via ATOMIC()
156 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
158 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed when shading spans.
160 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
162 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: [0] = slope along x, [1] = slope along y, [2] = base value,
// each a 16-byte aligned float4 (this is how DPSOFTRAST_CALCATTRIB reads them)
164 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
166 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues; _mm_load_ps is an aligned load, which
// relies on attribs being declared with ALIGN().
168 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
169 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
170 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
171 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
172 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: slope and data are float[4]
// lvalues filled component by component with the same formula
//   data[c] = attribs[idx][2][c] + span->x * slope[c] + span->y * attribs[idx][1][c].
// NOTE(review): `span` is used as (span->x) / (span->y) without its own
// parentheses, unlike (triangle) here and (span) in DPSOFTRAST_CALCATTRIB; an
// expression argument such as `spans + i` would mis-parse. Should read ((span)->x).
174 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
175 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
176 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
177 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
178 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
179 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
180 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
181 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
182 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// Not referenced in this part of the file; presumably a sub-span length in pixels - confirm.
185 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle, 16-byte aligned via ALIGN().
187 typedef ALIGN(struct DPSOFTRAST_State_Span_s
189 int triangle; // triangle this span was generated by
190 int x; // framebuffer x coord
191 int y; // framebuffer y coord
192 int startx; // usable range (according to pixelmask)
193 int endx; // usable range (according to pixelmask)
194 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
195 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
196 int depthslope; // depthbuffer value pixel delta
198 DPSOFTRAST_State_Span);
// Capacity of the per-thread spans[] and triangles[] arrays, and the span
// length that sizes pixelmaskarray (see DPSOFTRAST_State_Thread).
200 #define DPSOFTRAST_DRAW_MAXSPANS 1024
201 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
202 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate; each selects one Recalc* step in
// DPSOFTRAST_Validate. DPSOFTRAST_VALIDATE_DRAW is the union of all three.
204 #define DPSOFTRAST_VALIDATE_FB 1
205 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
206 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
207 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc maps the
// GL source/destination factor pair onto one of these; unrecognized pairs
// become DPSOFTRAST_BLENDMODE_OPAQUE.
209 typedef enum DPSOFTRAST_BLENDMODE_e
211 DPSOFTRAST_BLENDMODE_OPAQUE,
212 DPSOFTRAST_BLENDMODE_ALPHA,
213 DPSOFTRAST_BLENDMODE_ADDALPHA,
214 DPSOFTRAST_BLENDMODE_ADD,
215 DPSOFTRAST_BLENDMODE_INVMOD,
216 DPSOFTRAST_BLENDMODE_MUL,
217 DPSOFTRAST_BLENDMODE_MUL2,
218 DPSOFTRAST_BLENDMODE_SUBALPHA,
219 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
220 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes
221 DPSOFTRAST_BLENDMODE_TOTAL
223 DPSOFTRAST_BLENDMODE;
// Per-worker-thread copy of the render state plus that thread's scratch
// buffers. The DPSOFTRAST_Interpret_* functions write into it as commands are
// consumed. Several members used elsewhere in the file are declared on lines
// not shown here.
225 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
244 float polygonoffset[2];
// clip plane transformed by DPSOFTRAST_RecalcClipPlane
246 ALIGN(float fb_clipplane[4]);
249 int shader_permutation;
250 int shader_exactspecularmath;
// currently bound textures; pointers are rebased by DPSOFTRAST_Texture_Grow
252 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
254 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
255 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
257 // DPSOFTRAST_VALIDATE_ flags
260 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled by DPSOFTRAST_RecalcViewport
263 ALIGN(float fb_viewportcenter[4]);
264 ALIGN(float fb_viewportscale[4]);
266 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
269 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's read position in the command pool (compared against
// freecommand / drawcommand in DPSOFTRAST_Draw_FreeCommandPool)
278 ATOMIC(volatile int commandoffset);
// waiting: set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on waitcond
280 volatile bool waiting;
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting
281 volatile bool starving;
// per-thread scratch storage for span/triangle setup
288 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
289 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
290 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
292 DPSOFTRAST_State_Thread);
// Global rasterizer state as seen by the API (caller) side: framebuffer
// pointers, vertex array bindings, the texture table, the worker threads and
// the command pool. Several members used elsewhere in the file are declared on
// lines not shown here.
294 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers, indexed as y * fb_width + x by the clear routines
298 unsigned int *fb_depthpixels;
299 unsigned int *fb_colorpixels[4];
// caller-side viewport transform, refreshed by DPSOFTRAST_Viewport
302 ALIGN(float fb_viewportcenter[4]);
303 ALIGN(float fb_viewportscale[4]);
306 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
307 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array bindings
309 const float *pointer_vertex3f;
310 const float *pointer_color4f;
311 const unsigned char *pointer_color4ub;
312 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
315 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
316 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// currently bound textures; pointers are rebased by DPSOFTRAST_Texture_Grow
317 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
321 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
322 float *screencoord4f;
328 int shader_permutation;
329 int shader_exactspecularmath;
// texture table: lowest index that may be free, and the array itself
333 int texture_firstfree;
334 DPSOFTRAST_Texture *texture;
// last error message; set to a string literal by the functions that reject their arguments
339 const char *errorstring;
// worker thread states
344 DPSOFTRAST_State_Thread *threads;
// command pool position published to the workers by DPSOFTRAST_Draw_SyncCommands
346 ATOMIC(volatile int drawcommand);
348 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
352 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting a float depth to the integer
// depth buffer format (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
// Depth offset constant (not referenced in this part of the file).
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 pixel laid out as
// 0xAARRGGBB, rounding each channel to the nearest 8-bit value. Every argument
// is parenthesized so expression arguments (e.g. `x + y`) are scaled as a
// whole, and the shifts are done on unsigned values so that alpha == 255 does
// not overflow a signed int when shifted by 24. The result type is still int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) <<  8) | \
	((unsigned int)(int)((b) * 255.0f + 0.5f)      ) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d into the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d); d is parenthesized so that an expression
// argument such as `a - b` is subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations of the span depth test / depth write routines, whose
// definitions are not in this part of the file.
359 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
360 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform (center and scale) from a GL-style viewport
// rectangle {x, y, width, height}.
//   viewport           - input rectangle, 4 ints
//   fb_viewportcenter  - output, 4 floats
//   fb_viewportscale   - output, 4 floats
// Slot [1] is the horizontal axis, slot [2] the vertical axis (flipped against
// dpsoftrast.fb_height, with a negative scale), slot [3] is fixed at 0.5 and
// slot [0] is the identity (center 0, scale 1). Presumably the slots are laid
// out as (w, x, y, z) - confirm against the vertex transform.
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
364 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366 fb_viewportcenter[3] = 0.5f;
367 fb_viewportcenter[0] = 0.0f;
368 fb_viewportscale[1] = 0.5f * viewport[2];
369 fb_viewportscale[2] = -0.5f * viewport[3];
370 fb_viewportscale[3] = 0.5f;
371 fb_viewportscale[0] = 1.0f;
// Assigns the framebuffer row ranges [miny1, maxy1) and [miny2, maxy2) that
// this thread is responsible for, based on its index and the thread count.
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads bands; the thread gets
// band `index` and band `numthreads + index`
376 if (dpsoftrast.interlace)
378 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// non-interlaced: one band per thread, so both ranges are identical
385 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Transforms thread->clipplane into framebuffer space using the thread's
// viewport transform and stores the result in thread->fb_clipplane.
// Note the index shuffle: plane component i is divided by viewport scale slot
// i+1 (wrapping to slot 0 for the last component).
// NOTE(review): a viewport with zero width or height makes fb_viewportscale[1]
// or [2] zero, so the divisions below produce infinities/NaNs.
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
392 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// fold the viewport center offset into the plane's constant term
396 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything derived from the framebuffer setup for one thread:
// the scissor rectangle in framebuffer coordinates (fb_scissor = x, y, width,
// height), then the viewport transform, clip plane and row bands.
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
401 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402 // and viewport projection values
// scissor is given bottom-up; convert to top-down framebuffer rows
405 x1 = thread->scissor[0];
406 x2 = thread->scissor[0] + thread->scissor[2];
407 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
409 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
411 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
413 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
414 thread->fb_scissor[0] = x1;
415 thread->fb_scissor[1] = y1;
416 thread->fb_scissor[2] = x2 - x1;
417 thread->fb_scissor[3] = y2 - y1;
// order matters: RecalcClipPlane reads the viewport values written by RecalcViewport
419 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420 DPSOFTRAST_RecalcClipPlane(thread);
421 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth comparison: the configured depthfunc while depth
// testing is enabled, GL_ALWAYS otherwise.
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
426 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the GL blend factor pair (blendfunc[0] = source, blendfunc[1] =
// destination) onto one DPSOFTRAST_BLENDMODE value in thread->fb_blendmode.
// The pair is packed as (sfactor << 16) | dfactor for the switch. Any pair not
// listed falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognized
431 if (thread->blendsubtract)
433 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one `case` that selects blendmode for a factor pair
435 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular (additive equation) blending
443 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both spellings of "multiply" map to the same mode
450 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap front end for DPSOFTRAST_Validate: calls it only when at least one of
// the DPSOFTRAST_VALIDATE_ bits in f is still pending in thread->validate.
// Always evaluates to 0. Both arguments are parenthesized so that expression
// arguments (e.g. `threads + i`, `A | B`) bind correctly; note that each
// argument is evaluated twice when validation is needed.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state selected by mask (a combination of
// DPSOFTRAST_VALIDATE_ bits). Only bits that are also pending in
// thread->validate are processed; each one is cleared before its Recalc* call.
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// drop requested bits that are already up to date
464 mask &= thread->validate;
467 if (mask & DPSOFTRAST_VALIDATE_FB)
469 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470 DPSOFTRAST_RecalcFB(thread);
472 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
474 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475 DPSOFTRAST_RecalcDepthFunc(thread);
477 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
479 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by its handle. A handle is valid when it lies in
// [1, texture_end) and the slot has pixel storage allocated; index 0 is never
// a valid texture.
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
486 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every pointer into it (the global
// texbound[] and each thread's texbound[]) onto the reallocated array.
491 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound-texture pointers can be converted back to indices after realloc
493 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494 DPSOFTRAST_State_Thread *thread;
498 // expand texture array as needed
499 if (dpsoftrast.texture_max < 1024)
500 dpsoftrast.texture_max = 1024;
502 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the table
// and is not checked. On failure the old array is leaked and the rebasing
// below (and every later texture access) works from a NULL base. Assign to a
// temporary and handle NULL.
503 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase: new base + (old pointer - old base)
504 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505 if (dpsoftrast.texbound[i])
506 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
507 for (j = 0; j < dpsoftrast.numthreads; j++)
509 thread = &dpsoftrast.threads[j];
510 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511 if (thread->texbound[i])
512 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture object and allocates zeroed pixel storage for it.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions of mip level 0; must be powers of two
//                          and at most DPSOFTRAST_TEXTURE_MAXSIZE each
// Rejected argument combinations set dpsoftrast.errorstring (the return value
// on those paths is on lines not shown here).
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
525 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the MAXSIZE check below, so huge
// inputs can overflow int, and two negative dimensions yield a positive
// product that passes this test. Checking each dimension < 1 would be exact.
528 if (width*height*depth < 1)
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
533 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format validation
540 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
544 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message text as the cubemap rejection above; the
// condition guarding this assignment is not shown - verify the message fits.
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
562 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
564 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
567 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): exact duplicate of the check directly above (same condition, same message).
572 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
577 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
582 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
584 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
587 // find first empty slot in texture array
588 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589 if (!dpsoftrast.texture[texnum].bytes)
591 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
592 if (dpsoftrast.texture_max <= texnum)
593 DPSOFTRAST_Texture_Grow();
594 if (dpsoftrast.texture_end <= texnum)
595 dpsoftrast.texture_end = texnum + 1;
596 texture = &dpsoftrast.texture[texnum];
597 memset(texture, 0, sizeof(*texture));
598 texture->flags = flags;
599 texture->width = width;
600 texture->height = height;
601 texture->depth = depth;
602 texture->sides = sides;
// per mip level: s = bytes for this level at 4 bytes per texel; the level's
// record stores its byte offset (running total `size`), its byte size and
// its dimensions
614 s = w * h * d * sides * 4;
615 texture->mipmap[mipmaps][0] = size;
616 texture->mipmap[mipmaps][1] = s;
617 texture->mipmap[mipmaps][2] = w;
618 texture->mipmap[mipmaps][3] = h;
619 texture->mipmap[mipmaps][4] = d;
// stop after the 1x1x1 level, or after level 0 when mipmaps were not requested
622 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
628 texture->mipmaps = mipmaps;
629 texture->size = size;
631 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is shown here; a failed
// allocation would leave the slot looking free (bytes == NULL).
632 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and marks its slot free. Unknown or
// already-free handles are ignored.
636 void DPSOFTRAST_Texture_Free(int index)
638 DPSOFTRAST_Texture *texture;
639 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
643 MM_FREE(texture->bytes);
// bytes == NULL is what marks the slot as free; the memset clears the rest of the record too
644 texture->bytes = NULL;
645 memset(texture, 0, sizeof(*texture));
646 // adjust the free range and used range
647 if (dpsoftrast.texture_firstfree > index)
648 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing free slots
649 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture from level 0 by averaging
// neighbouring texels of the parent level (4 bytes per texel, each channel
// averaged independently with rounding). Does nothing for unknown handles or
// textures with a single level.
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
654 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output row in level i; i0..i3 = the (up to) four parent rows feeding it
655 unsigned char *o, *i0, *i1, *i2, *i3;
656 DPSOFTRAST_Texture *texture;
657 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658 if (texture->mipmaps <= 1)
660 for (i = 1;i < texture->mipmaps;i++)
662 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
666 if (layer1 >= texture->mipmap[i-1][4])
667 layer1 = texture->mipmap[i-1][4]-1;
668 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
672 if (row1 >= texture->mipmap[i-1][3])
673 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 bytes * ((height * layer + row) * width)
674 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
675 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679 w = texture->mipmap[i][2];
682 if (texture->mipmap[i-1][2] > 1)
684 // average 3D texture
// 8 parent texels (2 per row x 4 rows) -> 1 output texel; +4 rounds before >> 3
685 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
687 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
695 // average 3D mipmap with parent width == 1
// one texel from each of the four parent rows; +2 rounds before >> 2
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
698 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
707 if (texture->mipmap[i-1][2] > 1)
709 // average 2D texture (common case)
// 2x2 parent texels -> 1 output texel
710 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
712 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
720 // 2D texture with parent width == 1
// two vertically adjacent parent texels -> 1 output texel
721 o[0] = (i0[0] + i1[0] + 1) >> 1;
722 o[1] = (i0[1] + i1[1] + 1) >> 1;
723 o[2] = (i0[2] + i1[2] + 1) >> 1;
724 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain. Unknown handles are ignored.
// NOTE(review): in the lines shown here the destination is always computed
// from mip level 0 (`mip` is not used) and the rectangle is not checked
// against the texture size - confirm against the full function.
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
733 DPSOFTRAST_Texture *texture;
735 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
740 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741 while (blockheight > 0)
743 memcpy(dst, pixels, blockwidth * 4);
// source advances by one block row, destination by one full texture row
744 pixels += blockwidth * 4;
745 dst += texture->mipmap[0][2] * 4;
749 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 with `pixels` (which must provide
// mipmap[0][1] bytes, the byte size of level 0) and rebuilds the mip chain.
// Unknown handles are ignored.
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
758 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip` of the texture, or 0 for an unknown handle.
// NOTE(review): `mip` indexes mipmap[] without a range check.
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
763 DPSOFTRAST_Texture *texture;
764 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765 return texture->mipmap[mip][2];
// Returns the height of mip level `mip` of the texture, or 0 for an unknown handle.
// NOTE(review): `mip` indexes mipmap[] without a range check.
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
769 DPSOFTRAST_Texture *texture;
770 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip` of the texture, or 0 for an unknown handle.
// NOTE(review): `mip` indexes mipmap[] without a range check.
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
775 DPSOFTRAST_Texture *texture;
776 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or a null pointer for an unknown handle.
// NOTE(review): `mip` indexes mipmap[] without a range check.
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
781 DPSOFTRAST_Texture *texture;
782 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset into bytes
785 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected with an error string when the
// texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP. Unknown handles
// are ignored.
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
793 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
798 texture->filter = filter;
// Forward declaration; called by DPSOFTRAST_AllocateCommand when the command pool is full.
801 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the commands written so far: copies the pool's write position
// (freecommand) to dpsoftrast.drawcommand. With threads enabled a store fence
// is issued first so the command bytes are written before the new position.
803 static void DPSOFTRAST_Draw_SyncCommands(void)
805 if(dpsoftrast.usethreads) MEMORY_BARRIER;
806 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for `space` more bytes in the command ring: publishes pending
// commands, then waits for the worker thread that is furthest behind until
// enough of the ring has been consumed.
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
811 DPSOFTRAST_State_Thread *thread;
813 int freecommand = dpsoftrast.commandpool.freecommand;
814 int usedcommands = dpsoftrast.commandpool.usedcommands;
// already enough room: nothing to do
815 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
817 DPSOFTRAST_Draw_SyncCommands();
// recompute the in-use byte count as the largest backlog of any thread
823 for (i = 0; i < dpsoftrast.numthreads; i++)
825 thread = &dpsoftrast.threads[i];
// distance from the thread's read position to the write position, wrapped to the ring size
826 commandoffset = freecommand - thread->commandoffset;
827 if (commandoffset < 0)
828 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829 if (commandoffset > usedcommands)
832 usedcommands = commandoffset;
// done when there is enough room, or when no thread is behind
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// otherwise block on the slowest thread until it signals progress
837 thread = &dpsoftrast.threads[waitindex];
838 Thread_LockMutex(thread->drawmutex);
839 if (thread->commandoffset != dpsoftrast.drawcommand)
841 thread->waiting = true;
// wake the worker first if it is idle waiting for commands
842 if (thread->starving) Thread_CondSignal(thread->drawcond);
843 Thread_CondWait(thread->waitcond, thread->drawmutex);
844 thread->waiting = false;
846 Thread_UnlockMutex(thread->drawmutex);
848 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (which must be a power
// of two). The argument is evaluated exactly once, so expressions with side
// effects are safe to pass.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE - 1)) & ~(COMMAND_SIZE - 1))
// Reserves a slot in the command pool for command type `name`: the slot size
// is sizeof(DPSOFTRAST_Command_<name>) rounded up with DPSOFTRAST_ALIGNCOMMAND,
// the opcode is DPSOFTRAST_OPCODE_<name>, and the result is the pointer returned
// by DPSOFTRAST_AllocateCommand cast to DPSOFTRAST_Command_<name> *.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
// Reserves `size` bytes for a new command in the ring buffer and writes its
// header (opcode, commandsize). Callers normally go through
// DPSOFTRAST_ALLOCATECOMMAND. When the pool lacks room it first waits for the
// workers (threaded) or flushes.
//   opcode - DPSOFTRAST_OPCODE_* value stored in the header
//   size   - record size in bytes, already rounded with DPSOFTRAST_ALIGNCOMMAND
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
858 DPSOFTRAST_Command *command;
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra = room for one command header, plus the unusable tail of the pool
// when the new command does not fit before the end
861 int extra = sizeof(DPSOFTRAST_Command);
862 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free bytes: make room, then reload the pool positions
864 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
866 if (dpsoftrast.usethreads)
867 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
869 DPSOFTRAST_Draw_FlushThreads();
870 freecommand = dpsoftrast.commandpool.freecommand;
871 usedcommands = dpsoftrast.commandpool.usedcommands;
// tail too small: mark it with a Reset opcode and count the skipped bytes as used
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
875 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876 command->opcode = DPSOFTRAST_OPCODE_Reset;
877 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command at the current position
880 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881 command->opcode = opcode;
882 command->commandsize = size;
// wrap the write position at the end of the pool
884 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
886 dpsoftrast.commandpool.freecommand = freecommand;
887 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back `size` bytes of the most recent allocation: the write position
// and the used-byte count are stepped back and stored in the pool again.
// (The statement that steps freecommand back and the condition guarding the
// wrap-around adjustment are on lines not shown here.)
891 static void DPSOFTRAST_UndoCommand(int size)
893 int freecommand = dpsoftrast.commandpool.freecommand;
894 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around adjustment of the write position
897 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898 usedcommands -= size;
899 dpsoftrast.commandpool.freecommand = freecommand;
900 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker side: stores the rectangle in the thread state and marks the
// framebuffer-derived values dirty (recomputed by DPSOFTRAST_Validate).
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
906 thread->viewport[0] = command->x;
907 thread->viewport[1] = command->y;
908 thread->viewport[2] = command->width;
909 thread->viewport[3] = command->height;
910 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Viewport command for the workers and also updates the
// caller-side copy of the viewport and its derived center/scale immediately.
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
914 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
917 command->width = width;
918 command->height = height;
920 dpsoftrast.viewport[0] = x;
921 dpsoftrast.viewport[1] = y;
922 dpsoftrast.viewport[2] = width;
923 dpsoftrast.viewport[3] = height;
924 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to (r, g, b, a).
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker side: fills this thread's rows of every attached color buffer,
// limited to the scissor rectangle, with the packed BGRA8 clear color.
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
930 int i, x1, y1, x2, y2, w, h, x, y;
931 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
935 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936 miny1 = thread->miny1;
937 maxy1 = thread->maxy1;
938 miny2 = thread->miny2;
939 maxy2 = thread->maxy2;
940 x1 = thread->fb_scissor[0];
941 y1 = thread->fb_scissor[1];
942 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
944 if (y1 < miny1) y1 = miny1;
945 if (y2 > maxy2) y2 = maxy2;
950 // FIXME: honor fb_colormask?
951 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers
952 for (i = 0;i < 4;i++)
954 if (!dpsoftrast.fb_colorpixels[i])
// two passes: first band ends at min(y2, maxy1), second band starts at
// max(y, miny2) and ends at min(y2, maxy2)
956 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
959 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960 for (x = x1;x < x2;x++)
// API side: allocates a ClearColor command in the command pool (presumably
// then filled from r, g, b, a - those assignments are not shown here).
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
967 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to `depth`.
974 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker side: fills this thread's rows of the depth buffer, limited to the
// scissor rectangle, with the integer form of the clear depth.
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
977 int x1, y1, x2, y2, w, h, x, y;
978 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
982 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983 miny1 = thread->miny1;
984 maxy1 = thread->maxy1;
985 miny2 = thread->miny2;
986 maxy2 = thread->maxy2;
987 x1 = thread->fb_scissor[0];
988 y1 = thread->fb_scissor[1];
989 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
991 if (y1 < miny1) y1 = miny1;
992 if (y2 > maxy2) y2 = maxy2;
997 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: first band ends at min(y2, maxy1), second band starts at
// max(y, miny2) and ends at min(y2, maxy2)
998 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1001 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002 for (x = x1;x < x2;x++)
// API side: allocates a ClearDepth command in the command pool (presumably
// then filled from d - that assignment is not shown here).
1006 void DPSOFTRAST_ClearDepth(float d)
1008 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable/disable writes per color channel.
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker side: stores each channel flag normalized to 0/1 and builds the
// matching pixel bit mask in thread->fb_colormask.
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1015 thread->colormask[0] = command->r != 0;
1016 thread->colormask[1] = command->g != 0;
1017 thread->colormask[2] = command->b != 0;
1018 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects the channel's byte:
// red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000
1019 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: allocates a ColorMask command in the command pool (presumably
// then filled from r, g, b, a - those assignments are not shown here).
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1023 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable/disable depth testing.
1030 DEFCOMMAND(5, DepthTest, int enable;)
// Worker side: stores the flag and marks the derived depth function dirty.
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1033 thread->depthtest = command->enable;
1034 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues a DepthTest command carrying `enable`.
1036 void DPSOFTRAST_DepthTest(int enable)
1038 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039 command->enable = enable;
// Command 6: enable/disable the scissor test.
1042 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker side: stores the flag and marks the framebuffer-derived values dirty
// (fb_scissor depends on it, see DPSOFTRAST_RecalcFB).
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1045 thread->scissortest = command->enable;
1046 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a ScissorTest command carrying `enable`.
1048 void DPSOFTRAST_ScissorTest(int enable)
1050 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051 command->enable = enable;
// Command 7: set the scissor rectangle (fields are floats).
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker side: stores the rectangle and marks the framebuffer-derived values dirty.
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1057 thread->scissor[0] = command->x;
1058 thread->scissor[1] = command->y;
1059 thread->scissor[2] = command->width;
1060 thread->scissor[3] = command->height;
1061 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Scissor command for the given rectangle (the x/y
// assignments are not shown here).
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1065 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1068 command->width = width;
1069 command->height = height;
// Command 8: set the blend source/destination factors.
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Worker side: stores both factors and marks the derived blend mode dirty.
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1075 thread->blendfunc[0] = command->sfactor;
1076 thread->blendfunc[1] = command->dfactor;
1077 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public setter: allocates a BlendFunc command and stores both blend factors.
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1081 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082 command->sfactor = sfactor;
1083 command->dfactor = dfactor;
// BlendSubtract command record (DEFCOMMAND number 9; field: enable).
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to the given thread state: stores the flag
// and flags the blend state for revalidation (same validate bit as BlendFunc).
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1089 thread->blendsubtract = command->enable;
1090 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public setter: allocates a BlendSubtract command and stores `enable` in it.
1092 void DPSOFTRAST_BlendSubtract(int enable)
1094 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095 command->enable = enable;
// DepthMask command record (DEFCOMMAND number 10; field: enable).
1098 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command: stores the flag in thread->depthmask.
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1101 thread->depthmask = command->enable;
// Public setter: allocates a DepthMask command and stores `enable` in it.
1103 void DPSOFTRAST_DepthMask(int enable)
1105 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106 command->enable = enable;
// DepthFunc command record (DEFCOMMAND number 11; field: func).
1109 DEFCOMMAND(11, DepthFunc, int func;)
// Applies a DepthFunc command: stores the comparison selector in
// thread->depthfunc.
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1112 thread->depthfunc = command->func;
// Public setter: allocates a DepthFunc command and stores `func` in it.
1114 void DPSOFTRAST_DepthFunc(int func)
1116 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117 command->func = func;
// DepthRange command record (DEFCOMMAND number 12; fields: nearval, farval).
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: stores near/far in thread->depthrange[0]/[1].
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1123 thread->depthrange[0] = command->nearval;
1124 thread->depthrange[1] = command->farval;
// Public setter: allocates a DepthRange command and stores both bounds in it.
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1128 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129 command->nearval = nearval;
1130 command->farval = farval;
// PolygonOffset command record (DEFCOMMAND number 13; fields: alongnormal,
// intoview).
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: stores the two offsets in
// thread->polygonoffset[0] (alongnormal) and [1] (intoview).
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1136 thread->polygonoffset[0] = command->alongnormal;
1137 thread->polygonoffset[1] = command->intoview;
// Public setter: allocates a PolygonOffset command and stores both offsets.
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1141 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142 command->alongnormal = alongnormal;
1143 command->intoview = intoview;
// CullFace command record (DEFCOMMAND number 14; field: mode).
1146 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1149 thread->cullface = command->mode;
// Public setter: allocates a CullFace command and stores `mode` in it.
1151 void DPSOFTRAST_CullFace(int mode)
1153 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154 command->mode = mode;
// AlphaTest command record (DEFCOMMAND number 15; field: enable).
1157 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: stores the flag in thread->alphatest.
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1160 thread->alphatest = command->enable;
// Public setter: allocates an AlphaTest command and stores `enable` in it.
1162 void DPSOFTRAST_AlphaTest(int enable)
1164 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165 command->enable = enable;
// AlphaFunc command record (DEFCOMMAND number 16; fields: func, ref).
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores the comparison selector in
// thread->alphafunc and the reference value in thread->alphavalue.
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1171 thread->alphafunc = command->func;
1172 thread->alphavalue = command->ref;
// Public setter: allocates an AlphaFunc command and stores `func` in it.
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1176 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177 command->func = func;
// Stores an RGBA color directly into dpsoftrast.color[0..3]; unlike the
// setters above, no command is allocated here.
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1183 dpsoftrast.color[0] = r;
1184 dpsoftrast.color[1] = g;
1185 dpsoftrast.color[2] = b;
1186 dpsoftrast.color[3] = a;
// Reads a rectangular block of 4-byte pixels from color buffer 0 into
// outpixels. The requested rectangle is clamped to the framebuffer, and rows
// are fetched bottom-up ((fb_height - 1 - y)), so the output is vertically
// flipped relative to framebuffer storage order.
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Byte strides: output rows are blockwidth pixels wide, input rows fb_width.
1191 int outstride = blockwidth * 4;
1192 int instride = dpsoftrast.fb_width * 4;
1195 int bx2 = blockx + blockwidth;
1196 int by2 = blocky + blockheight;
1200 unsigned char *inpixels;
// Clamp the rectangle to [0, fb_width) x [0, fb_height).
1204 if (bx1 < 0) bx1 = 0;
1205 if (by1 < 0) by1 = 0;
1206 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1209 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Two copy loops follow: one taken when dpsoftrast.bigendian is set, one otherwise.
1210 if (dpsoftrast.bigendian)
1212 for (y = by1;y < by2;y++)
// NOTE(review): the output row is addressed relative to the clamped by1 (and
// has no column offset for a clamped bx1), so a block that starts outside the
// framebuffer appears to be packed at the start of outpixels — confirm intended.
1214 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215 o = (unsigned char *)outpixels + (y - by1) * outstride;
1216 for (x = bx1;x < bx2;x++)
1229 for (y = by1;y < by2;y++)
1231 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 (source origin sx,sy)
// into mip level `mip` of texture `index` (destination origin tx,ty).
// Both rectangles are clamped to their surfaces, the copy size is the smaller
// of the two clamped sizes, and source rows are walked bottom-up.
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1242 int tx2 = tx + width;
1243 int ty2 = ty + height;
1246 int sx2 = sx + width;
1247 int sy2 = sy + height;
1257 unsigned int *spixels;
1258 unsigned int *tpixels;
1259 DPSOFTRAST_Texture *texture;
// Bail out on an unknown texture handle or an out-of-range mip level.
1260 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261 if (mip < 0 || mip >= texture->mipmaps) return;
// Source surface: color buffer 0 at framebuffer dimensions.
1263 spixels = dpsoftrast.fb_colorpixels[0];
1264 swidth = dpsoftrast.fb_width;
1265 sheight = dpsoftrast.fb_height;
// Destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] and [3] as the level's width and height.
1266 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267 twidth = texture->mipmap[mip][2];
1268 theight = texture->mipmap[mip][3];
// Clamp destination and source rectangles to their surfaces.
1269 if (tx1 < 0) tx1 = 0;
1270 if (ty1 < 0) ty1 = 0;
1271 if (tx2 > twidth) tx2 = twidth;
1272 if (ty2 > theight) ty2 = theight;
1273 if (sx1 < 0) sx1 = 0;
1274 if (sy1 < 0) sy1 = 0;
1275 if (sx2 > swidth) sx2 = swidth;
1276 if (sy2 > sheight) sy2 = sheight;
// Never copy more than the source rectangle provides.
1281 if (tw > sw) tw = sw;
1282 if (th > sh) th = sh;
1283 if (tw < 1 || th < 1)
// Flip the source start row, then copy row by row: destination rows ascend
// while source rows descend (sy1 - y).
1285 sy1 = sheight - 1 - sy1;
1286 for (y = 0;y < th;y++)
1287 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// Rebuild the mip chain when the texture has more than one level.
1288 if (texture->mipmaps > 1)
1289 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command record (DEFCOMMAND number 17; fields: unitnum, texture).
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to the given thread state: if the unit already
// has a texture bound, atomically decrements that texture's `binds` counter,
// then stores the new texture pointer (which may be NULL) for the unit.
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1295 if (thread->texbound[command->unitnum])
1296 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297 thread->texbound[command->unitnum] = command->texture;
// Public setter binding texture handle `index` to texture unit `unitnum`.
// Validates the unit number and the handle (recording an error string on
// failure), allocates a SetTexture command, mirrors the binding in
// dpsoftrast.texbound and adds dpsoftrast.numthreads to the texture's `binds`
// counter (each thread's interpreter later decrements it once on rebind).
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1301 DPSOFTRAST_Command_SetTexture *command;
1302 DPSOFTRAST_Texture *texture;
1303 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1305 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
// Only a non-zero index with no matching texture is treated as an error, so
// index 0 is accepted even when the lookup yields no texture.
1308 texture = DPSOFTRAST_Texture_GetByIndex(index);
1309 if (index && !texture)
1311 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1315 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316 command->unitnum = unitnum;
1317 command->texture = texture;
1319 dpsoftrast.texbound[unitnum] = texture;
// NOTE(review): texture is dereferenced here without a NULL guard. If
// DPSOFTRAST_Texture_GetByIndex(0) returns NULL (the check above allows that
// case through), this is a NULL dereference; the interpreter side does guard
// its ATOMIC_DECREMENT with a NULL test. Confirm and add `if (texture)`.
1320 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its byte stride in dpsoftrast.
// The stride is applied as a byte offset when the array is loaded.
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1325 dpsoftrast.pointer_vertex3f = vertex3f;
1326 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: records the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1330 dpsoftrast.pointer_color4f = color4f;
1331 dpsoftrast.pointer_color4ub = NULL;
1332 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: records the pointer and stride
// and clears the float color pointer so only one color source is active.
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1336 dpsoftrast.pointer_color4f = NULL;
1337 dpsoftrast.pointer_color4ub = color4ub;
1338 dpsoftrast.stride_color = stride;
// Records the texcoord array for unit `unitnum`: pointer, component count and
// byte stride.
// NOTE(review): unitnum indexes three arrays with no range check visible
// here (DPSOFTRAST_SetTexture does validate its unit number) — confirm callers
// never pass an out-of-range unit.
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1342 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command record (DEFCOMMAND number 18; fields: mode, permutation,
// exactspecularmath).
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command: copies all three fields into the thread state.
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1350 thread->shader_mode = command->mode;
1351 thread->shader_permutation = command->permutation;
1352 thread->shader_exactspecularmath = command->exactspecularmath;
// Public setter: allocates a SetShader command carrying the three values and
// also mirrors them immediately in the dpsoftrast state.
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1356 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357 command->mode = mode;
1358 command->permutation = permutation;
1359 command->exactspecularmath = exactspecularmath;
// Mirror into dpsoftrast so the values are visible without going through a thread.
1361 dpsoftrast.shader_mode = mode;
1362 dpsoftrast.shader_permutation = permutation;
1363 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command record (DEFCOMMAND number 19; fields: index, val[4]).
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f table at float offset index*4.
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1369 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public setter for a 4-float uniform given as scalars: allocates a Uniform4f
// command with the values and mirrors them in dpsoftrast.uniform4f at float
// offset index*4.
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1373 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374 command->index = index;
1375 command->val[0] = v0;
1376 command->val[1] = v1;
1377 command->val[2] = v2;
1378 command->val[3] = v3;
// Mirror into the dpsoftrast copy of the uniform table.
1380 dpsoftrast.uniform4f[index*4+0] = v0;
1381 dpsoftrast.uniform4f[index*4+1] = v1;
1382 dpsoftrast.uniform4f[index*4+2] = v2;
1383 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: reads four floats from `v`, stores
// them in a newly allocated Uniform4f command and mirrors them in
// dpsoftrast.uniform4f at float offset index*4.
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1387 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388 command->index = index;
1389 memcpy(command->val, v, sizeof(command->val));
1391 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command record (DEFCOMMAND number 20; fields: index and a
// 16-float matrix declared through ALIGN).
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f table starting at float offset index*4 (four consecutive slots).
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1397 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
// Each matrix advances the slot index by 4 and the source pointer by 16
// floats, gets its own UniformMatrix4f command, and is mirrored in
// dpsoftrast.uniform4f.
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1403 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1405 __m128 m0, m1, m2, m3;
1406 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407 command->index = (DPSOFTRAST_UNIFORM)index;
// Use unaligned loads when the source address is not a multiple of ALIGN_SIZE.
1408 if (((size_t)v)&(ALIGN_SIZE-1))
1410 m0 = _mm_loadu_ps(v);
1411 m1 = _mm_loadu_ps(v+4);
1412 m2 = _mm_loadu_ps(v+8);
1413 m3 = _mm_loadu_ps(v+12);
1417 m0 = _mm_load_ps(v);
1418 m1 = _mm_load_ps(v+4);
1419 m2 = _mm_load_ps(v+8);
1420 m3 = _mm_load_ps(v+12);
// 4x4 transpose via unpack + movelh/movehl: afterwards m0..m3 hold what were
// columns 0..3 of the input. Presumably guarded by the `transpose` argument
// on a line not shown here — confirm.
1424 __m128 t0, t1, t2, t3;
1425 t0 = _mm_unpacklo_ps(m0, m1);
1426 t1 = _mm_unpacklo_ps(m2, m3);
1427 t2 = _mm_unpackhi_ps(m0, m1);
1428 t3 = _mm_unpackhi_ps(m2, m3);
1429 m0 = _mm_movelh_ps(t0, t1);
1430 m1 = _mm_movehl_ps(t1, t0);
1431 m2 = _mm_movelh_ps(t2, t3);
1432 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores into the command payload, then into the dpsoftrast copy;
// both destinations must therefore be 16-byte aligned.
1434 _mm_store_ps(command->val, m0);
1435 _mm_store_ps(command->val+4, m1);
1436 _mm_store_ps(command->val+8, m2);
1437 _mm_store_ps(command->val+12, m3);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command record (DEFCOMMAND number 21; fields: index, val).
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the integer in thread->uniform1i[index].
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1449 thread->uniform1i[command->index] = command->val;
// Public setter for an integer uniform: allocates a Uniform1i command for
// `index` and mirrors the value i0 in dpsoftrast.uniform1i.
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1453 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454 command->index = index;
1457 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command record (DEFCOMMAND number 24; field: clipplane[4]).
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the four plane coefficients into
// thread->clipplane and sets DPSOFTRAST_VALIDATE_FB for revalidation.
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1463 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1464 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public setter: allocates a ClipPlane command and stores the four plane
// coefficients (x, y, z, w) in it.
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1468 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469 command->clipplane[0] = x;
1470 command->clipplane[1] = y;
1471 command->clipplane[2] = z;
1472 command->clipplane[3] = w;
// Copies `size` 4-float attributes from a strided byte source into a packed
// float4 destination (dst must be 16-byte aligned: aligned stores are used).
// `stride` is the byte distance between consecutive source elements.
1476 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1478 float *end = dst + size*4;
// If either the start address or the stride breaks ALIGN_SIZE alignment, every
// element must be fetched with an unaligned load; otherwise aligned loads work.
1479 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1483 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1492 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float attributes from a strided byte source into packed
// float4 values (x, y, z, 1.0f). dst must be 16-byte aligned.
// A tightly packed source (stride == 12 bytes) is converted four elements at
// a time from three 16-byte loads; any other layout is converted one element
// at a time.
// NOTE(review): the per-element path loads 16 bytes from each 12-byte
// attribute, i.e. it reads 4 bytes beyond the element; for the final element
// that may be past the end of the caller's array — confirm buffers allow this.
1499 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1501 float *end = dst + size*4;
1502 if (stride == sizeof(float[3]))
// end4 marks the end of the portion that is a whole multiple of 4 elements.
1504 float *end4 = dst + (size&~3)*4;
1505 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Unaligned variant. v1|v2|v3 hold 12 floats = 4 xyz triples. Each output is
// built by rotating the wanted xyz into lanes 1..3, writing 1.0f into lane 0
// with _mm_move_ss, then rotating back so the 1.0f lands in lane 3 (w).
1509 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1510 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1511 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1517 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1518 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1521 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1523 src += 4*sizeof(float[3]);
// Aligned variant of the same 4-at-a-time conversion.
1530 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1531 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1532 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1538 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1539 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1542 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1544 src += 4*sizeof(float[3]);
// Per-element conversion: load 4 floats, overwrite the 4th with 1.0f.
1548 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1552 __m128 v = _mm_loadu_ps((const float *)src);
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556 _mm_store_ps(dst, v);
1565 __m128 v = _mm_load_ps((const float *)src);
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1567 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1568 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1569 _mm_store_ps(dst, v);
// Expands `size` 2-float attributes from a strided byte source into packed
// float4 values (x, y, 0.0f, 1.0f). dst must be 16-byte aligned.
1576 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 float *end = dst + size*4;
// v2 supplies the constant upper half (z = 0, w = 1) of every output.
1579 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed source: one 16-byte load yields two elements per iteration.
1580 if (stride == sizeof(float[2]))
1582 float *end2 = dst + (size&~1)*4;
1583 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = (x0, y0, x1, y1): the shuffle yields (x0, y0, 0, 1) and movehl yields
// (x1, y1, 0, 1).
1587 __m128 v = _mm_loadu_ps((const float *)src);
1588 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1591 src += 2*sizeof(float[2]);
1598 __m128 v = _mm_load_ps((const float *)src);
1599 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1600 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1602 src += 2*sizeof(float[2]);
// Per-element conversion: load 8 bytes into the low half, keep (0, 1) above.
1608 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte attributes from a strided byte source into packed
// float4 values, scaling each byte by 1/255 so 0..255 maps to 0.0..1.0.
// dst must be 16-byte aligned.
1614 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1616 float *end = dst + size*4;
1617 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed source: one 16-byte load yields four elements per iteration.
1618 if (stride == sizeof(unsigned char[4]))
1620 float *end4 = dst + (size&~3)*4;
1621 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Zero-extend bytes -> 16-bit -> 32-bit by interleaving with zero, convert
// to float, then scale.
1625 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631 src += 4*sizeof(unsigned char[4]);
1638 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1639 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1641 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1642 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1644 src += 4*sizeof(unsigned char[4]);
// Per-element conversion: the 4 bytes are read through an int pointer.
// NOTE(review): src is an unsigned char pointer with arbitrary stride, so this
// int read may be misaligned — fine on x86, confirm for other targets.
1650 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1651 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a packed float4 destination with one constant value: the four floats
// at `src` are loaded once (unaligned load) and stored to dst with aligned
// stores; `end` marks dst + 4*size floats.
1657 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1659 float *end = dst + 4*size;
1660 __m128 v = _mm_loadu_ps(src);
1663 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vectors by a 4x4 matrix given as 16 floats.
// Each result is v.x*row0 + v.y*row1 + v.z*row2 + v.w*row3, where rowN is the
// N-th group of four floats in inmatrix16f. out4f may equal in4f.
1669 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1672 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1673 __m128 m0, m1, m2, m3;
// Bitwise comparison against the identity: only an exact identity matrix
// takes the copy-only path.
1675 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1677 // fast case for identity matrix
1678 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1681 end = out4f + numitems*4;
// Matrix rows are loaded unaligned, so inmatrix16f needs no special alignment.
1682 m0 = _mm_loadu_ps(inmatrix16f);
1683 m1 = _mm_loadu_ps(inmatrix16f + 4);
1684 m2 = _mm_loadu_ps(inmatrix16f + 8);
1685 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the loop body follow: unaligned input loads, then aligned.
1686 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1690 __m128 v = _mm_loadu_ps(in4f);
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1693 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1695 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1704 __m128 v = _mm_load_ps(in4f);
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1708 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1709 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1717 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1719 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a clip-space vertex and applies the viewport mapping.
// With in = (x, y, z, w), center c and scale s, the result is
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// i.e. lane 0 of viewportcenter/viewportscale applies to the 1/w term, which
// ends up in lane 3 of the output. `in` is evaluated once; `out` is assigned.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro whose visible body is identical to
// DPSOFTRAST_PROJECTVERTEX: divide by w, scale and offset by the viewport,
// with the 1/w term placed in lane 3 of `out`.
// NOTE(review): no difference from DPSOFTRAST_PROJECTVERTEX is visible here —
// confirm whether the two are meant to diverge or one can alias the other.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a float4 by a 4x4 matrix supplied as four __m128 rows:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// where p is a local holding the input vertex (presumably initialized from
// `in` on a line not shown here).
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent covered by an axis-aligned bounding box
// (minposf..maxposf, each 16-byte aligned) after transformation by
// inmatrix16f, and writes it to *starty / *endy.
// Each of the 8 box corners is transformed; corners with z + w >= 0
// contribute their projected y directly and have their bit cleared in
// clipmask. Corners failing that test contribute the points where their three
// box edges cross z + w = 0, for edges that lead to a passing corner.
// NOTE(review): the return statement is not visible here; callers store the
// result in dpsoftrast.drawclipped, so presumably clipmask is returned — confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// One bit per corner; a set bit means "failed the z + w >= 0 test".
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start inverted (2 / -2) so the first candidate replaces them.
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// Swap lanes 0 and 1 of every matrix row so the transformed y lands in lane 0,
// where the scalar (_ss) min/max/div operations below act on it.
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// >= 0 clear bit k and fold y/w into minproj/maxproj.
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// The BBCLIP(k) calls above handle each corner that failed the test: for each
// of its three edge neighbours (k^1, k^2, k^4) that passed, the edge is
// intersected with z + w = 0 (frac = d_k / (d_k - d_n)) and the projected y of
// that intersection is folded into minproj/maxproj.
// Move lane 2 of the viewport vectors (the y term in the PROJECTVERTEX
// layout) into lane 0 for the scalar math below.
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// Clamp the projected range to [-2, 2] before mapping to pixels.
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (+1 after truncation).
// NOTE(review): this ordering implies a negative y viewport scale — confirm.
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed float4 vertices to screen space.
// Each input vertex is copied unchanged to out4f and its viewport-projected
// form is written to screen4f (both written with aligned stores). A
// component-wise min/max of the inputs is accumulated and passed, with an
// identity matrix, to DPSOFTRAST_Vertex_BoundY, whose result is returned.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
// Two copies of the loop follow: unaligned input loads, then aligned ones.
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex.
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned temporaries for the BoundY call.
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` float4 vertices by inmatrix16f and projects them.
// The transformed vertex goes to out4f and its viewport-projected form to
// screen4f. The min/max bounds are accumulated on the UNtransformed inputs and
// handed to DPSOFTRAST_Vertex_BoundY together with the matrix, whose result is
// returned. An exact identity matrix delegates to DPSOFTRAST_Vertex_Project.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// Bitwise identity test; skips the per-vertex matrix multiply.
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Two copies of the loop follow: unaligned input loads, then aligned ones.
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
// Spill the pre-transform bounds; BoundY applies the matrix to the box corners.
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Converts one client vertex array (`inarray`) into packed float4 form in
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices elements
// starting at dpsoftrast.firstvertex. The source pointer is advanced by
// firstvertex * stride bytes before conversion.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
// Positions: 3 floats per vertex, expanded to (x, y, z, 1).
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colors: float array if set, else byte array if set, else the constant
// dpsoftrast.color replicated for every vertex.
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
// NOTE(review): stride was already loaded from stride_color at the top of
// this case; this reassignment looks redundant.
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texture coordinates: unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0; the
// loader is chosen by the unit's recorded component count.
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a vertex array in place by inmatrix16f. When inarray >= 0 the
// array is first loaded into post_array4f[outarray]; a negative inarray means
// "use what is already in post_array4f[outarray]".
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a vertex array to screen space without a matrix. Loads the array
// when inarray >= 0 (otherwise reuses post_array4f[outarray]), writes screen
// coordinates to dpsoftrast.screencoord4f, and records the Y range and the
// clip result in dpsoftrast.drawstarty / drawendy / drawclipped.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a vertex array in place by inmatrix16f and projects it. Loads
// the array when inarray >= 0 (otherwise reuses post_array4f[outarray]),
// writes screen coordinates to dpsoftrast.screencoord4f, and records the Y
// range and the clip result in dpsoftrast.drawstarty / drawendy / drawclipped.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for a span (output buffer: zf).
// w is evaluated from the triangle's plane coefficients as
//   w(x) = triangle->w[2] + span->y*triangle->w[1] + (span->x + x)*triangle->w[0]
// and 1/w is computed exactly only at sub-span boundaries, with linear
// interpolation in between.
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// 1/w at the first pixel of the span.
2028 float endz = 1.0f / (w + wslope * startx);
// No w change along x: a single value serves the whole span.
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
2036 for (x = startx;x < endx;)
// Sub-spans are DPSOFTRAST_DRAW_MAXSUBSPAN pixels long; the last one is
// clamped so it ends on the final pixel (endx-1).
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
// Per-pixel step toward the next exact value; 0 for a one-pixel sub-span,
// which also avoids dividing by zero.
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// Writes a span of 4-byte-per-pixel colors (in4ub) into color buffer 0
// (dpsoftrast.fb_colorpixels[0]).  Pixels are selected by span->pixelmask,
// optionally culled by the alpha test, and combined with the existing
// framebuffer contents according to thread->fb_blendmode.
// NOTE(review): several short lines (braces, declarations of x/subx, break
// statements) are not visible in this excerpt.
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
// ini / pixeli are 32-bit (whole pixel) views of the same bytes as in4ub / pixel
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
2058 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to the span origin, so index x below is span-relative
2062 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063 pixeli += span->y * dpsoftrast.fb_width + span->x;
2064 // handle alphatest now (this affects depth writes too)
// byte 3 of each input pixel is the value tested; below 128 the pixel is masked off
2065 if (thread->alphatest)
2066 for (x = startx;x < endx;x++)
2067 if (in4ub[x*4+3] < 128)
2068 pixelmask[x] = false;
2069 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070 // helps sprites, text and hud artwork
2071 switch(thread->fb_blendmode)
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alternate between runs with byte 3 >= 1 (skipped over) and runs with
// byte 3 == 0, whose mask entries are cleared
2077 for (x = startx;x < endx;x++)
2079 if (in4ub[x*4+3] >= 1)
2084 while (++x < endx && in4ub[x*4+3] >= 1) ;
2086 if (x >= endx) break;
2088 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089 if (x >= endx) break;
// the remaining blend modes are listed here too; their statements (if any)
// are not visible in this excerpt
2096 case DPSOFTRAST_BLENDMODE_OPAQUE:
2097 case DPSOFTRAST_BLENDMODE_ADD:
2098 case DPSOFTRAST_BLENDMODE_INVMOD:
2099 case DPSOFTRAST_BLENDMODE_MUL:
2100 case DPSOFTRAST_BLENDMODE_MUL2:
2101 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102 case DPSOFTRAST_BLENDMODE_INVADD:
2105 // put some special values at the end of the mask to ensure the loops end
// (a set byte stops the "masked" scan, the following clear byte stops the
// "unmasked" scan; this writes two bytes past endx, so the mask buffer must
// have room for them)
2106 pixelmask[endx] = 1;
2107 pixelmask[endx+1] = 0;
2108 // LordHavoc: use a double loop to identify subspans, this helps the
2109 // optimized copy/blend loops to perform at their best, most triangles
2110 // have only one run of pixels, and do the search using wide reads...
2114 // if this pixel is masked off, it's probably not alone...
2121 // the 4-item search must be aligned or else it stalls badly
2122 if ((x & 3) && !pixelmask[x])
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
2132 if(pixelmask[x]) goto endmasked;
// skip four mask bytes at a time while all four are zero
2137 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// then finish the scan one byte at a time
2141 for (;!pixelmask[x];x++)
2143 // rather than continue the loop, just check the end variable
2148 // find length of subspan
2151 if (subx + 8 < endx)
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
2163 if(!pixelmask[subx]) goto endunmasked;
// advance four mask bytes at a time while all four are exactly 1
2168 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2172 for (;pixelmask[subx];subx++)
2174 // the checks can overshoot, so make sure to clip it...
2178 // now that we know the subspan length... process!
// [x, subx) is now a run of unmasked pixels
2179 switch(thread->fb_blendmode)
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
// opaque: straight copy of the run; two variants are visible (memcpy and an
// unrolled SSE copy) - the preprocessor/condition lines that choose between
// them are not shown here
2185 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels per iteration with unaligned 128-bit loads/stores
2190 while (x + 16 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2200 while (x + 4 <= subx)
2202 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2208 pixeli[x+1] = ini[x+1];
// ALPHA blend: dst += (src - dst) * a / 256, where a is 16-bit word 3 of the
// source pixel (the same channel the alpha test reads).
// FINISHBLEND (defined next) processes the run two pixels at a time and then
// a single leftover pixel, with src/dst widened to 16 bits per channel and
// the result packed back with unsigned saturation; blend2 / blend1 are the
// two-pixel and one-pixel blend statements (the lines that expand them are
// not visible in this excerpt).
2218 case DPSOFTRAST_BLENDMODE_ALPHA:
2219 #define FINISHBLEND(blend2, blend1) \
2220 for (;x + 1 < subx;x += 2) \
2223 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2226 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2231 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2234 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * a) >> 8
2245 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (saturated when packed back to bytes)
2254 case DPSOFTRAST_BLENDMODE_ADD:
2255 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2257 case DPSOFTRAST_BLENDMODE_INVMOD:
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2264 case DPSOFTRAST_BLENDMODE_MUL:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2267 case DPSOFTRAST_BLENDMODE_MUL2:
2268 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * a) >> 8
2270 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2272 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * a) >> 8)
2279 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) / 256
2288 case DPSOFTRAST_BLENDMODE_INVADD:
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples mip level `mip` of `texture` at coordinates (x, y), which are scaled
// by the mip's width and height.  With DPSOFTRAST_TEXTURE_FILTER_LINEAR set
// the four neighbouring texels are blended bilinearly into c[0..3]; otherwise
// a single texel is addressed (the statements that copy it into c are not
// visible in this excerpt).  Addressing is clamped when
// DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set, else masked with size-1.
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301 // warning: this is SLOW, only use if the optimized per-span functions won't do
2303 const unsigned char * RESTRICT pixelbase;
2304 const unsigned char * RESTRICT pixel[4];
// mipmap[mip][0] is used as the byte offset of the level, [2] and [3] as its
// width and height in texels
2305 int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2306 int wrapmask[2] = { width-1, height-1 };
2307 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2308 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// texel coordinates in fixed point with 12 fraction bits, shifted back by
// half a texel (2048 == 0.5)
// NOTE(review): a negative float converted to unsigned here is undefined
// behaviour in C - presumably relied on to wrap; confirm
2310 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// bilinear weights: products of two 12-bit fractions, so the four always sum
// to 1<<24 (hence the >>24 below)
2313 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
// integer texel and its +1 neighbour on each axis
2314 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// clamp every index to [0, size-1]
2318 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
// otherwise wrap by masking with size-1 (a true repeat only when the size is
// a power of two - presumably guaranteed by the texture setup; confirm)
2325 tci[0] &= wrapmask[0];
2326 tci[1] &= wrapmask[1];
2327 tci1[0] &= wrapmask[0];
2328 tci1[1] &= wrapmask[1];
// addresses of the 2x2 neighbourhood: 4 bytes per texel, `width` texels per row
2330 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2331 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2332 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2333 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
// weighted sum per byte channel, channel order kept as stored
2334 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
// unfiltered path: truncate the scaled coordinates to a texel index
2341 int tci[2] = { x * width, y * height };
2342 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2344 tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345 tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2349 tci[0] &= wrapmask[0];
2350 tci[1] &= wrapmask[1];
2352 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
// Fills out4f for pixels startx..endx-1 of the span with float colors sampled
// from the 2D texture bound to unit `texunitindex`.  Texture coordinates come
// from interpolated attribute `arrayindex` (data + slope*x) multiplied by the
// per-pixel factor zf[x] and by the mip level's size.  Texel bytes 2,1,0,3 go
// to output components 0,1,2,3, scaled to the 0..1 range.
// NOTE(review): several short lines (braces, some declarations and
// assignments such as tc/tcimin setup and the `if (filter)` test) are not
// visible in this excerpt.
2360 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2363 int startx = span->startx;
2364 int endx = span->endx;
2369 float tc[2], endtc[2];
2371 unsigned int tci[2];
2372 unsigned int tci1[2];
2373 unsigned int tcimin[2];
2374 unsigned int tcimax[2];
2379 const unsigned char * RESTRICT pixelbase;
2380 const unsigned char * RESTRICT pixel[4];
2381 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2382 // if no texture is bound, just fill it with white
2385 for (x = startx;x < endx;x++)
2387 out4f[x*4+0] = 1.0f;
2388 out4f[x*4+1] = 1.0f;
2389 out4f[x*4+2] = 1.0f;
2390 out4f[x*4+3] = 1.0f;
2394 mip = triangle->mip[texunitindex];
2395 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2396 // if this mipmap of the texture is 1 pixel, just fill it with that color
// (mipmap[mip][1] == 4 is the test used for "one 4-byte pixel")
// NOTE(review): the color is read from texture->bytes[...] rather than from
// pixelbase, i.e. without the mipmap[mip][0] offset applied just above; the
// BGRA8 variant of this function reads through pixelbase.  Looks like the
// wrong texel when this level's offset is nonzero - confirm.
2397 if (texture->mipmap[mip][1] == 4)
2399 c[0] = texture->bytes[2] * (1.0f/255.0f);
2400 c[1] = texture->bytes[1] * (1.0f/255.0f);
2401 c[2] = texture->bytes[0] * (1.0f/255.0f);
2402 c[3] = texture->bytes[3] * (1.0f/255.0f);
2403 for (x = startx;x < endx;x++)
2405 out4f[x*4+0] = c[0];
2406 out4f[x*4+1] = c[1];
2407 out4f[x*4+2] = c[2];
2408 out4f[x*4+3] = c[3];
2412 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2413 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2414 flags = texture->flags;
// mipmap[mip][2] / [3] are used as the level's width / height in texels
2415 tcscale[0] = texture->mipmap[mip][2];
2416 tcscale[1] = texture->mipmap[mip][3];
2417 tciwidth = texture->mipmap[mip][2];
2420 tcimax[0] = texture->mipmap[mip][2]-1;
2421 tcimax[1] = texture->mipmap[mip][3]-1;
2422 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2423 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact texel-space coordinates at the first pixel of the span
2424 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2425 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in chunks of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// exact coordinates are computed at chunk ends and stepped linearly between
2431 for (x = startx;x < endx;)
2433 unsigned int subtc[2];
2434 unsigned int substep[2];
// subscale converts a per-chunk delta into a per-pixel step with 12 fraction bits
2435 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2436 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2437 if (nextsub >= endx)
// final chunk: end on the last pixel and rescale the step to its real length
2439 nextsub = endsub = endx-1;
2440 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2444 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2445 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): these float-to-unsigned conversions are undefined for
// negative values in C - presumably relied on to wrap; confirm
2451 substep[0] = (endtc[0] - tc[0]) * subscale;
2452 substep[1] = (endtc[1] - tc[1]) * subscale;
2453 subtc[0] = tc[0] * (1<<12);
2454 subtc[1] = tc[1] * (1<<12);
// four inner loops follow: {bilinear, single texel} x {clamp, wrap}
2457 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
// bilinear, clamped addressing
2459 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and their complements; the four weights sum to 1<<24
2461 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2462 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2463 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2464 tci[0] = subtc[0]>>12;
2465 tci[1] = subtc[1]>>12;
2466 tci1[0] = tci[0] + 1;
2467 tci1[1] = tci[1] + 1;
// clamp each index to [tcimin, tcimax]
2468 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2469 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2470 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2471 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2472 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2473 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2474 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2475 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 == 255 * (1<<24): removes the weight scale and the byte range at once
2476 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2477 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2478 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2479 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2480 out4f[x*4+0] = c[0];
2481 out4f[x*4+1] = c[1];
2482 out4f[x*4+2] = c[2];
2483 out4f[x*4+3] = c[3];
// bilinear, wrapped addressing (index masked with size-1)
2488 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2490 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2491 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2492 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2493 tci[0] = subtc[0]>>12;
2494 tci[1] = subtc[1]>>12;
2495 tci1[0] = tci[0] + 1;
2496 tci1[1] = tci[1] + 1;
2497 tci[0] &= tciwrapmask[0];
2498 tci[1] &= tciwrapmask[1];
2499 tci1[0] &= tciwrapmask[0];
2500 tci1[1] &= tciwrapmask[1];
2501 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2502 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2503 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2504 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2505 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2506 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2507 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2508 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2509 out4f[x*4+0] = c[0];
2510 out4f[x*4+1] = c[1];
2511 out4f[x*4+2] = c[2];
2512 out4f[x*4+3] = c[3];
// single texel, clamped addressing
2516 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520 tci[0] = subtc[0]>>12;
2521 tci[1] = subtc[1]>>12;
2522 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2523 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2524 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2525 c[0] = pixel[0][2] * (1.0f / 255.0f);
2526 c[1] = pixel[0][1] * (1.0f / 255.0f);
2527 c[2] = pixel[0][0] * (1.0f / 255.0f);
2528 c[3] = pixel[0][3] * (1.0f / 255.0f);
2529 out4f[x*4+0] = c[0];
2530 out4f[x*4+1] = c[1];
2531 out4f[x*4+2] = c[2];
2532 out4f[x*4+3] = c[3];
// single texel, wrapped addressing
2537 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2539 tci[0] = subtc[0]>>12;
2540 tci[1] = subtc[1]>>12;
2541 tci[0] &= tciwrapmask[0];
2542 tci[1] &= tciwrapmask[1];
2543 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2544 c[0] = pixel[0][2] * (1.0f / 255.0f);
2545 c[1] = pixel[0][1] * (1.0f / 255.0f);
2546 c[2] = pixel[0][0] * (1.0f / 255.0f);
2547 c[3] = pixel[0][3] * (1.0f / 255.0f);
2548 out4f[x*4+0] = c[0];
2549 out4f[x*4+1] = c[1];
2550 out4f[x*4+2] = c[2];
2551 out4f[x*4+3] = c[3];
// SSE2 span sampler with 8-bit output: fills out4ub (4 bytes per pixel) for
// pixels startx..endx-1 with texels of the 2D texture bound to unit
// `texunitindex`, in the byte order stored in the texture.  Texture
// coordinates come from interpolated attribute `arrayindex` multiplied by
// zf[x] and the mip level's size, and are stepped in 16.16 fixed point, two
// pixels per inner-loop iteration plus a single-pixel tail.
// NOTE(review): several short lines (braces, some declarations, `if (filter)`
// tests, the tail-pixel conditions) are not visible in this excerpt.
2557 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2561 int startx = span->startx;
2562 int endx = span->endx;
2564 __m128 data, slope, tcscale;
2565 __m128i tcsize, tcmask, tcoffset, tcmax;
2567 __m128i subtc, substep, endsubtc;
2570 int affine; // LordHavoc: optimized affine texturing case
// outi is a 32-bit (whole pixel) view of out4ub
2571 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2572 const unsigned char * RESTRICT pixelbase;
2573 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2574 // if no texture is bound, just fill it with white
2577 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2580 mip = triangle->mip[texunitindex];
2581 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2582 // if this mipmap of the texture is 1 pixel, just fill it with that color
2583 if (texture->mipmap[mip][1] == 4)
2585 unsigned int k = *((const unsigned int *)pixelbase);
2586 for (x = startx;x < endx;x++)
// equal zf at both ends of the span: treated as affine, which makes the loop
// below handle the whole span as a single chunk
2590 affine = zf[startx] == zf[endx-1];
2591 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2592 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2593 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice - used as width
// and height; tcmask = size-1 per lane
2594 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2595 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2596 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and
// pre-scale them to texel units
2597 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2598 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// exact texel-space coordinates at the first pixel
2599 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift (presumably guarded by the filter test on a line not shown)
2601 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// convert to 16.16 fixed point
2602 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// per 32-bit lane: low 16 bits = 4 (bytes per texel), high 16 bits = width*4
// (bytes per row); _mm_madd_epi16 of an (x, y) 16-bit pair with this yields
// the texel's byte offset
2603 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 as 16-bit values, for clamping/masking the 16-bit (x, y) pairs
2604 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in chunks of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2605 for (x = startx;x < endx;)
2607 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2608 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2609 if (nextsub >= endx || affine)
// last (or only) chunk: end on the final pixel and rescale the step
2611 nextsub = endsub = endx-1;
2612 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2616 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2618 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2619 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2620 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half of subtc = coordinates of pixel x, high half = pixel x+1; the step
// is doubled because each iteration advances two pixels
2621 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2622 substep = _mm_slli_epi32(substep, 1);
// fast path test: integer coordinates at the chunk start and just past its end
// are all >= 0 and < size-1, so the 2x2 footprint never needs clamp or wrap
2625 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2626 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texture row (high 16 bits of tcoffset)
2628 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2629 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2631 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high 16-bit) parts of x,y for both pixels
2632 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2633 tci = _mm_madd_epi16(tci, tcoffset);
2634 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2635 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; pix1/pix3 are
// the upper rows, pix2/pix4 the rows below, widened to 16 bits per channel
2636 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2637 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2638 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2639 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// 15-bit fractions; first blend the two rows with the y fraction, then the
// two columns with the x fraction
2640 fracm = _mm_srli_epi16(subtc, 1);
2641 pix1 = _mm_add_epi16(pix1,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644 pix3 = _mm_add_epi16(pix3,
2645 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2646 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2647 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2648 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2649 pix2 = _mm_add_epi16(pix2,
2650 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2651 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// pack back to bytes and store both pixels at once
2652 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the same filter (tail of the chunk)
2656 const unsigned char * RESTRICT ptr1;
2657 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2660 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2661 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2662 fracm = _mm_srli_epi16(subtc, 1);
2663 pix1 = _mm_add_epi16(pix1,
2664 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2665 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2666 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2667 pix1 = _mm_add_epi16(pix1,
2668 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2670 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered, clamp-to-edge: each of the four footprint texels is addressed
// separately after clamping its (x, y) to [0, size-1]
2674 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2676 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2678 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
// the added constant offsets the four lanes by (0,0), (1,0), (0,1), (1,1)
2679 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2680 tci = _mm_madd_epi16(tci, tcoffset);
2681 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683 _mm_setzero_si128());
2684 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686 _mm_setzero_si128());
2687 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the first pixel of the pair (above) is clamped with min/max,
// but this second pixel is masked with AND exactly like the wrapping branch
// further down - looks like a copy/paste slip that makes odd pixels wrap
// instead of clamp; confirm whether this should be
// _mm_min_epi16(_mm_max_epi16(..., _mm_setzero_si128()), tcmax)
2688 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2689 tci = _mm_madd_epi16(tci, tcoffset);
2690 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692 _mm_setzero_si128());
2693 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695 _mm_setzero_si128());
2696 fracm = _mm_srli_epi16(subtc, 1);
2697 pix1 = _mm_add_epi16(pix1,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2700 pix3 = _mm_add_epi16(pix3,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2702 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2703 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2704 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2705 pix2 = _mm_add_epi16(pix2,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2707 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2708 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, clamped
2712 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2713 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2714 tci = _mm_madd_epi16(tci, tcoffset);
2715 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2716 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2717 _mm_setzero_si128());
2718 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2719 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2720 _mm_setzero_si128());
2721 fracm = _mm_srli_epi16(subtc, 1);
2722 pix1 = _mm_add_epi16(pix1,
2723 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726 pix1 = _mm_add_epi16(pix1,
2727 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered, wrapping: same as above but indices are masked with size-1
2735 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742 _mm_setzero_si128());
2743 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745 _mm_setzero_si128());
2746 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751 _mm_setzero_si128());
2752 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754 _mm_setzero_si128());
2755 fracm = _mm_srli_epi16(subtc, 1);
2756 pix1 = _mm_add_epi16(pix1,
2757 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759 pix3 = _mm_add_epi16(pix3,
2760 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764 pix2 = _mm_add_epi16(pix2,
2765 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail, wrapping
2771 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2773 tci = _mm_madd_epi16(tci, tcoffset);
2774 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2775 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2776 _mm_setzero_si128());
2777 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2778 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2779 _mm_setzero_si128());
2780 fracm = _mm_srli_epi16(subtc, 1);
2781 pix1 = _mm_add_epi16(pix1,
2782 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785 pix1 = _mm_add_epi16(pix1,
2786 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel lookups (no blending), clamped
2795 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2797 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2799 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2800 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2801 tci = _mm_madd_epi16(tci, tcoffset);
2802 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2803 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2807 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2808 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2809 tci = _mm_madd_epi16(tci, tcoffset);
2810 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel lookups, wrapping
2816 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2818 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2819 tci = _mm_and_si128(tci, tcmax);
2820 tci = _mm_madd_epi16(tci, tcoffset);
2821 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2822 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2826 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2827 tci = _mm_and_si128(tci, tcmax);
2828 tci = _mm_madd_epi16(tci, tcoffset);
2829 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2838 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2841 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// NOTE(review): only the signature is visible in this excerpt.  Judging by the
// name this presumably returns a shadow-map term for `vector` - confirm
// against the function body before relying on it.
2844 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies an input float4 span buffer by an interpolated vertex attribute:
// for each pixel x in startx..endx-1, out4f = in4f * ((data + slope*x) * z),
// per component, where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for
// attribute `arrayindex`.
// NOTE(review): the line that assigns z is not visible in this excerpt -
// presumably z = zf[x]; confirm.
2850 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2853 int startx = span->startx;
2854 int endx = span->endx;
2859 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2860 for (x = startx;x < endx;x++)
// attribute value at this pixel, scaled by z
2863 c[0] = (data[0] + slope[0]*x) * z;
2864 c[1] = (data[1] + slope[1]*x) * z;
2865 c[2] = (data[2] + slope[2]*x) * z;
2866 c[3] = (data[3] + slope[3]*x) * z;
2867 out4f[x*4+0] = in4f[x*4+0] * c[0];
2868 out4f[x*4+1] = in4f[x*4+1] * c[1];
2869 out4f[x*4+2] = in4f[x*4+2] * c[2];
2870 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into a float4 span buffer: for each
// pixel x in startx..endx-1, out4f = (data + slope*x) * z, per component,
// where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for attribute
// `arrayindex`.
// NOTE(review): the line that assigns z is not visible in this excerpt -
// presumably z = zf[x]; confirm.
2874 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2877 int startx = span->startx;
2878 int endx = span->endx;
2883 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2884 for (x = startx;x < endx;x++)
2887 c[0] = (data[0] + slope[0]*x) * z;
2888 c[1] = (data[1] + slope[1]*x) * z;
2889 c[2] = (data[2] + slope[2]*x) * z;
2890 c[3] = (data[3] + slope[3]*x) * z;
2891 out4f[x*4+0] = c[0];
2892 out4f[x*4+1] = c[1];
2893 out4f[x*4+2] = c[2];
2894 out4f[x*4+3] = c[3];
// Float span op for bloom: for every x in [span->startx, span->endx)
//   out4f[x] = ina4f[x] + max(inb4f[x] - subcolor, 0)   per channel.
// subcolor points at 4 floats subtracted from the second input before it is
// added; negative results are clamped to zero.
// triangle is not used by the visible code.
2898 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2900 int x, startx = span->startx, endx = span->endx;
2901 float c[4], localcolor[4];
// local copy of the subtract colour
2902 localcolor[0] = subcolor[0];
2903 localcolor[1] = subcolor[1];
2904 localcolor[2] = subcolor[2];
2905 localcolor[3] = subcolor[3];
2906 for (x = startx;x < endx;x++)
// subtract the threshold colour and clamp at zero
2908 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2909 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2910 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2911 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
// add the clamped remainder onto the first input
2912 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2913 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2914 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2915 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span op: component-wise product of two span buffers,
//   out4f[x] = ina4f[x] * inb4f[x]
// for every x in [span->startx, span->endx), 4 floats per pixel.
// triangle is not used by the visible code.
2919 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2921 int x, startx = span->startx, endx = span->endx;
2922 for (x = startx;x < endx;x++)
2924 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2925 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2926 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2927 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span op: component-wise sum of two span buffers,
//   out4f[x] = ina4f[x] + inb4f[x]
// for every x in [span->startx, span->endx), 4 floats per pixel.
// No clamping is applied. triangle is not used by the visible code.
2931 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2933 int x, startx = span->startx, endx = span->endx;
2934 for (x = startx;x < endx;x++)
2936 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2937 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2938 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2939 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span op: blends two span buffers per pixel,
//   out4f[x] = ina4f[x] * a + inb4f[x] * b
// for every x in [span->startx, span->endx), with a = 1 - (alpha of inb4f[x]).
// NOTE(review): the declaration of a and b and the assignment of b are not
// visible here; b is presumably the alpha of inb4f[x] (a standard lerp) - confirm.
2943 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2945 int x, startx = span->startx, endx = span->endx;
2947 for (x = startx;x < endx;x++)
// weight of the first buffer: one minus the second buffer's alpha
2949 a = 1.0f - inb4f[x*4+3];
2951 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2952 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2953 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2954 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span op: blends every pixel of the span towards one constant colour,
//   out4f[x] = in4f[x] * (1 - color[3]) + color * color[3]
// for every x in [span->startx, span->endx). color points at 4 floats whose
// fourth component is the blend factor. The alpha channel is blended with the
// same formula. triangle is not used by the visible code.
2958 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2960 int x, startx = span->startx, endx = span->endx;
2961 float localcolor[4], ilerp, lerp;
// local copy of the blend colour
2962 localcolor[0] = color[0];
2963 localcolor[1] = color[1];
2964 localcolor[2] = color[2];
2965 localcolor[3] = color[3];
// blend weights are constant over the span, so compute them once
2966 ilerp = 1.0f - localcolor[3];
2967 lerp = localcolor[3];
2968 for (x = startx;x < endx;x++)
2970 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2971 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2972 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2973 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span op (SSE2): multiplies each 4-byte pixel of in4ub by an
// interpolated vertex attribute (array `arrayindex`) and writes it to out4ub.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at the
// ends of sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; in between it
// is stepped linearly in fixed point (value * 256).
// NOTE(review): the declarations of x, data, slope, mod and endmod and the
// per-sub-span setup of mod/submod are not visible here; they presumably take
// over the previous endmod/endsubmod - confirm.
2979 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2983 int startx = span->startx;
2984 int endx = span->endx;
2987 __m128i submod, substep, endsubmod;
2988 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 so the attribute channels line up with the BGRA byte order
2989 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2990 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, float and as value*256 integers
2991 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2992 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced by the inner code, not by the for header
2993 for (x = startx; x < endx;)
2995 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default step scale assumes a full-length sub-span
2996 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2997 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to its real length
2999 nextsub = endsub = endx-1;
3000 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the sub-span end, and the per-pixel fixed-point step towards it
3004 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3005 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3006 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack two consecutive pixels' modulators (current, current+step) into 8 x 16-bit
3007 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3008 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3009 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes go into the high half of each 16-bit lane (byte*256); the high word of
// the product with the value*256 modulator is then byte*value
3011 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3012 pix = _mm_mulhi_epu16(pix, submod);
3013 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3017 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3018 pix = _mm_mulhi_epu16(pix, submod);
3019 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2): writes the interpolated vertex attribute (array
// `arrayindex`) itself as 4-byte pixels into out4ub.
// The attribute is evaluated exactly, as (data + slope*x) * zf[x], only at the
// ends of sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; in between it
// is stepped linearly in fixed point (value * 4095), and shifted right by 4 to
// get the 8-bit result.
// NOTE(review): the declarations of x, data, slope, mod and endmod and the
// per-sub-span setup of mod/submod are not visible here; they presumably take
// over the previous endmod/endsubmod - confirm.
3026 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3030 int startx = span->startx;
3031 int endx = span->endx;
3034 __m128i submod, substep, endsubmod;
3035 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 so the attribute channels line up with the BGRA byte order
3036 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3037 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact attribute value at the first pixel, float and as value*4095 integers
3038 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3039 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced by the inner code, not by the for header
3040 for (x = startx; x < endx;)
3042 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default step scale assumes a full-length sub-span
3043 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3044 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to its real length
3046 nextsub = endsub = endx-1;
3047 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the sub-span end, and the per-pixel fixed-point step towards it
3051 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3052 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3053 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack two consecutive pixels' values (current, current+step) into 8 x 16-bit
3054 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3055 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
3056 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// value*4095 >> 4 gives the 0..255 range; packus saturates to bytes
3058 __m128i pix = _mm_srai_epi16(submod, 4);
3059 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3063 __m128i pix = _mm_srai_epi16(submod, 4);
3064 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span op (SSE2) for bloom: per channel,
//   out = saturate(ina + max(inb - subcolor*255, 0))
// over the span, 4 bytes per pixel. subcolor points at 4 floats; lanes 0 and 2
// are swapped before use so they line up with the BGRA byte order.
// triangle is not used by the visible code.
3071 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3074 int x, startx = span->startx, endx = span->endx;
// subtract colour as 0..255 integers, swizzled, then duplicated for two pixels
3075 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3076 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3077 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes
3079 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3080 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps at zero; packus saturates the sum to 255
3081 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3082 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3086 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3087 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3088 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3089 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): component-wise product of two byte buffers over the
// span, 4 bytes per pixel. One operand is widened to value and the other to
// value*256, so the high word of their 16-bit product is (a*b)/256.
// triangle is not used by the visible code.
3094 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3097 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3098 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes in the low half of each lane; pix2: bytes in the high half
3100 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3101 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3102 pix1 = _mm_mulhi_epu16(pix1, pix2);
3103 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3107 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3108 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3109 pix1 = _mm_mulhi_epu16(pix1, pix2);
3110 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): component-wise saturating sum of two byte buffers over
// the span, 4 bytes per pixel. Bytes are widened to 16-bit, added, and packed
// back with unsigned saturation (results above 255 clamp to 255).
// triangle is not used by the visible code.
3115 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3118 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3119 for (x = startx;x+2 <= endx;x+=2)
3121 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3122 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3123 pix1 = _mm_add_epi16(pix1, pix2);
3124 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3128 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3129 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3130 pix1 = _mm_add_epi16(pix1, pix2);
3131 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): out = saturate(ina + inb * tint) per channel over the
// span, 4 bytes per pixel. inbtintbgra points at 4 floats that are used in the
// order given (no channel swizzle is applied here) and converted to value*256
// fixed point. triangle is not used by the visible code.
3136 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3139 int x, startx = span->startx, endx = span->endx;
// tint as value*256 integers, duplicated for two pixels
3140 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3141 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3142 for (x = startx;x+2 <= endx;x+=2)
// pix1: bytes in the low half of each lane; pix2: bytes in the high half (byte*256)
3144 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3145 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high word of tint*256 times byte*256 is byte*tint
3146 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3147 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3151 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3152 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3153 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3154 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends ina towards inb using inb's own fourth byte as
// the blend factor, per pixel over the span:
//   out = ina + (inb - ina) * inb.a / 256
// Both factors are shifted left by 4 so the high word of the signed 16-bit
// product equals (difference * alpha) >> 8.
// triangle is not used by the visible code.
3159 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3162 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3163 for (x = startx;x+2 <= endx;x+=2)
3165 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3166 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth channel (alpha) to all four of its lanes
3167 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3168 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3169 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3173 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3174 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3175 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3176 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3177 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span op (SSE2): blends every pixel of the span towards one constant
// colour, using that colour's fourth component as the blend factor:
//   out = in + (color*255 - in) * (color[3]*255) / 256
// color points at 4 floats; lanes 0 and 2 are swapped so they line up with the
// BGRA byte order. triangle is not used by the visible code.
3182 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3185 int x, startx = span->startx, endx = span->endx;
// colour as 0..255 integers, swizzled, then duplicated for two pixels
3186 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3187 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the alpha lane to every channel and pre-shift it by 4; together with
// the 4-bit shift of the difference below, mulhi yields (diff * alpha) >> 8
3188 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3189 for (x = startx;x+2 <= endx;x+=2)
3191 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3192 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3193 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible here)
3197 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3198 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3199 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage for the generic shader mode: transforms/projects the position
// array with the ModelViewProjection matrix uniform, passes the colour and
// first texcoord arrays through, and additionally passes the second texcoord
// array when the SPECULAR permutation bit is set.
3206 void DPSOFTRAST_VertexShader_Generic(void)
3208 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3209 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3210 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// second texcoord set is only needed when a second texture is combined
3211 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3212 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3215 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3217 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3218 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3219 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3220 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3221 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3222 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3224 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3225 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3226 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3228 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3229 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3232 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3234 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3237 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3239 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3242 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3247 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3248 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the post-process shader mode: transforms/projects the
// position array with the ModelViewProjection matrix uniform, passes the first
// texcoord array through, and loads the TEXCOORD1 output array from the
// TEXCOORD4 input array.
3253 void DPSOFTRAST_VertexShader_PostProcess(void)
3255 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3256 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// note the differing source array: output TEXCOORD1 is fed from input TEXCOORD4
3257 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the post-process shader mode: samples the first texture into
// the fragment buffer, optionally adds bloom from the second texture (BLOOM
// permutation, minus the BloomColorSubtract uniform), mixes in the
// ViewTintColor uniform, and hands the span to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The SATURATION and GAMMARAMPS permutations are tested but not implemented.
3260 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3262 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3263 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3264 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3266 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture sampled with attribute array 2, straight into the output buffer
3267 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3268 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture sampled with attribute array 3, then added in place
3270 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3271 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// in-place blend towards the view tint colour
3273 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3274 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3276 // TODO: implement saturation
3278 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3280 // TODO: implement gammaramps
3282 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the depth/shadow shader mode: only the position array is
// transformed/projected with the ModelViewProjection matrix uniform; no other
// vertex arrays are loaded.
3287 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3289 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the depth/shadow shader mode: fills the span's fragment
// colours with zero bytes and hands them to DPSOFTRAST_Draw_Span_FinishBGRA8.
3292 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3294 // this is never called (because colormask is off when this shader is used)
3295 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3296 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3297 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero (endx - startx) pixels of 4 bytes each, starting at startx
3298 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3299 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the flat-colour shader mode: transforms/projects the
// position array with the ModelViewProjection matrix uniform and transforms
// the first texcoord array with the TexMatrix uniform.
3304 void DPSOFTRAST_VertexShader_FlatColor(void)
3306 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3307 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the flat-colour shader mode (SSE2):
//   result = texture colour * Color_Ambient uniform, alpha scaled by the Alpha uniform.
// Pixels are written straight to the first colour framebuffer, unless alpha
// test is on or the blend mode is not opaque; then they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the loop-local declarations, the statements that skip the
// three extra pixels after the 4-wide path, and any per-pixel mask test in the
// single-pixel path are not visible here.
3310 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3313 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's origin inside framebuffer 0 (4 bytes per pixel)
3314 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3315 int x, startx = span->startx, endx = span->endx;
3316 __m128i Color_Ambientm;
3317 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3318 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3319 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3320 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3321 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the generic finish path, so render to the local buffer
3322 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3323 pixel = buffer_FragColorbgra8;
// ambient colour as value*256 integers with lanes 0 and 2 swapped (BGRA order)
3324 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear lane 3 and replace it with the Alpha uniform scaled by 255
3325 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3326 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes, duplicated so one register covers two pixels
3327 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3328 for (x = startx;x < endx;x++)
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3331 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3334 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes are widened to byte*256 so the high word of the product is byte*factor
3335 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3337 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3343 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3344 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3345 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still needs the finish pass
3347 if (pixel == buffer_FragColorbgra8)
3348 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the vertex-colour shader mode: transforms/projects the
// position array with the ModelViewProjection matrix uniform, passes the
// colour array through, and transforms the first texcoord array with the
// TexMatrix uniform.
3354 void DPSOFTRAST_VertexShader_VertexColor(void)
3356 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3357 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3358 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the vertex-colour shader mode (SSE2):
//   result = texture colour * (Color_Diffuse * vertex colour + Color_Ambient)
// with the output alpha factor taken from the Alpha uniform. The vertex colour
// is interpolated per pixel as (data + slope*x) * z.
// Pixels are written straight to the first colour framebuffer, unless alpha
// test is on or the blend mode is not opaque; then they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of data, slope and mod2/pix2, the statements
// that skip the three extra pixels after the 4-wide path, and any per-pixel
// mask test in the single-pixel path are not visible here.
3361 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3364 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's origin inside framebuffer 0 (4 bytes per pixel)
3365 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3366 int x, startx = span->startx, endx = span->endx;
3367 __m128i Color_Ambientm, Color_Diffusem;
3369 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3370 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3371 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3372 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3373 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3374 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the generic finish path, so render to the local buffer
3375 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3376 pixel = buffer_FragColorbgra8;
// ambient colour as value*256 integers (lanes 0 and 2 swapped), lane 3 replaced by Alpha*255
3377 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3378 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3379 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3380 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as value*4096 integers (lanes 0 and 2 swapped), lane 3 cleared
3381 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex colour interpolants, swizzled like the colours, advanced to startx and
// pre-scaled by 4096 so diffuse*4096 times colour*4096, high word, is value*256
3384 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3385 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3386 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3387 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3388 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3389 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped once per loop iteration in the for header
3390 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3392 __m128i color, mod, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3393 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3396 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3397 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// one modulator per pixel, each using that pixel's own z; data is stepped three
// times here (the loop header supplies the fourth step)
3398 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3399 data = _mm_add_ps(data, slope);
3400 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3401 data = _mm_add_ps(data, slope);
3402 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3403 data = _mm_add_ps(data, slope);
3404 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*vertexcolour + ambient) * texture, pixels 0-1 then pixels 2-3
3405 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3406 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3407 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3408 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3409 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3415 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3416 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3417 mod = _mm_packs_epi32(mod, mod);
3418 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3419 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still needs the finish pass
3421 if (pixel == buffer_FragColorbgra8)
3422 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the lightmap shader mode: transforms/projects the position
// array with the ModelViewProjection matrix uniform, transforms the first
// texcoord array with the TexMatrix uniform, and passes the TEXCOORD4 array
// through (the pixel stage samples the lightmap with it).
3428 void DPSOFTRAST_VertexShader_Lightmap(void)
3430 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3431 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3432 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the lightmap shader mode (SSE2):
//   result = texture colour * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, additionally + Color_Glow * glow texture.
// The output alpha factor is taken from the Alpha uniform.
// Pixels are written straight to the first colour framebuffer, unless alpha
// test is on or the blend mode is not opaque; then they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the pix2 declarations, the statements that skip the three
// extra pixels after each 4-wide path, any per-pixel mask test in the
// single-pixel paths, and the else that separates the glow and non-glow loops
// are not visible here.
3435 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3438 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the span's origin inside framebuffer 0 (4 bytes per pixel)
3439 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3440 int x, startx = span->startx, endx = span->endx;
3441 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3442 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3443 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3444 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3445 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3446 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3447 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3448 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3449 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the generic finish path, so render to the local buffer
3450 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3451 pixel = buffer_FragColorbgra8;
// ambient colour as value*256 integers (lanes 0 and 2 swapped), lane 3 replaced by Alpha*255
3452 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3453 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3454 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3455 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as value*256 integers (lanes 0 and 2 swapped), lane 3 cleared
3456 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3457 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3458 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3459 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare its colour factor the same way
3461 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3462 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3463 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3464 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow colour; used by the single-pixel path
3465 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3466 for (x = startx;x < endx;x++)
3468 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3469 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3472 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3473 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3474 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse*lightmap + ambient) * colour + glowcolour * glow, pixels 0-1 then 2-3
3475 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3476 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3477 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3478 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3479 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3480 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3481 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes the lit colour term and the high half
// the glow term (its lightmap lanes are zero); the halves are then summed
3487 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3488 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3489 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3490 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3491 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3492 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3497 for (x = startx;x < endx;x++)
3499 __m128i color, lightmap, pix;
// fast path: four pixels at once when at least four remain and all four mask bytes are 1
3500 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3503 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3504 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse*lightmap + ambient) * colour, pixels 0-1 then 2-3
3505 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3506 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3507 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3508 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3509 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3515 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3516 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3517 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3518 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still needs the finish pass
3521 if (pixel == buffer_FragColorbgra8)
3522 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrapper shaders
// that follow call these two functions before their definitions appear
// further down in this file.
3527 void DPSOFTRAST_VertexShader_LightDirection(void);
3528 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader: does no work of its own and delegates entirely to
// DPSOFTRAST_VertexShader_LightDirection().
3530 void DPSOFTRAST_VertexShader_FakeLight(void)
3532 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader: forwards thread, triangle and span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which selects its FakeLight
// behaviour by testing thread->shader_mode against SHADERMODE_FAKELIGHT.
3535 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3537 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space light-direction-map vertex shader: runs the common
// LightDirection vertex shader, then additionally loads TEXCOORD4.
// The LightDirection pixel shader samples the lightmap and deluxemap
// textures using TEXCOORD4.
3542 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3544 DPSOFTRAST_VertexShader_LightDirection();
3545 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles this
// mode via its SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches.
3548 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3550 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex shader: runs the common
// LightDirection vertex shader, then additionally loads TEXCOORD4.
// The LightDirection pixel shader samples the lightmap and deluxemap
// textures using TEXCOORD4.
3555 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3557 DPSOFTRAST_VertexShader_LightDirection();
3558 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles this
// mode via its SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches.
3561 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3563 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader shared by the LightDirection, FakeLight and LightDirectionMap
// modes. For every vertex it expresses the light direction and the
// vertex-to-eye vector in the basis formed by the per-vertex svector, tvector
// and normal, and writes them to TEXCOORD5 and TEXCOORD6 respectively.
// Takes no parameters and returns nothing; all inputs and outputs go through
// the global `dpsoftrast` state.
// NOTE(review): the declarations of i, LightDir, EyeVector, position, svector,
// tvector and normal are not visible in this listing.
3568 void DPSOFTRAST_VertexShader_LightDirection(void)
3571 int numvertices = dpsoftrast.numvertices;
3573 float LightVector[4];
3574 float EyePosition[4];
3575 float EyeVectorModelSpace[4];
// Fetch the LightDir and EyePosition uniforms (four floats each).
3581 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3582 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3583 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3584 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3585 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3586 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3587 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3588 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read by the loop below; TEXCOORD0 goes through the
// TexMatrixM1 uniform matrix, the others are loaded as-is.
3589 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3590 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3591 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3592 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3593 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3594 for (i = 0;i < numvertices;i++)
// Per-vertex inputs: position, svector (TEXCOORD1), tvector (TEXCOORD2)
// and normal (TEXCOORD3); only the first three components are used.
3596 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3597 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3598 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3599 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3600 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3601 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3602 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3603 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3604 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3605 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3606 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3607 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir),
// stored in TEXCOORD5 with w = 0.
3608 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3609 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3610 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3611 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3612 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3613 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3614 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// EyeVector = (EyePosition - position) projected onto svector, tvector and
// normal, stored in TEXCOORD6 with w = 0.
3615 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3616 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3617 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3618 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3619 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3620 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3621 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3622 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3623 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3624 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Finally project POSITION with the ModelViewProjectionMatrixM1 uniform.
// NOTE(review): the meaning of the -1 argument is not visible here - confirm
// against DPSOFTRAST_Array_TransformProject.
3626 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros.
// DPSOFTRAST_Min / DPSOFTRAST_Max: smaller / larger of two values; being
//   macros, the selected argument is evaluated twice, so avoid side effects.
// DPSOFTRAST_Vector3Dot: dot product of the first three elements of a and b.
// DPSOFTRAST_Vector3LengthSquared: dot product of v with itself.
// DPSOFTRAST_Vector3Length: sqrt() of the squared length.
3629 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3630 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3631 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3632 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3633 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// DPSOFTRAST_Vector3Normalize: multi-line macro that starts by computing the
// length of v as sqrt(dot(v, v)).
// NOTE(review): the rest of the macro body is not visible in this listing;
// presumably it scales v by 1/len - confirm, including how len == 0 is handled.
3634 #define DPSOFTRAST_Vector3Normalize(v)\
3637 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, FakeLight and LightDirectionMap
// (model-space / tangent-space) modes; this is the scalar float path.
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation,
//              shader_mode and shader_exactspecularmath.
//   triangle - triangle whose attributes are interpolated (passed through to
//              the span helpers and DPSOFTRAST_CALCATTRIB4F).
//   span     - span to shade; pixels span->startx .. span->endx-1 are written.
//
// Three shading paths are chosen from thread->shader_permutation:
//   SPECULAR: ambient + (diffuse + specular) * light colour (+ glow)
//   DIFFUSE:  ambient + diffuse * light colour (+ glow)
//   neither:  ambient only (+ glow)
// Within the first two paths thread->shader_mode selects where the light
// direction and light colour come from. Results are written as bytes to
// buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8().
//
// NOTE(review): several lines of this function (braces, `else` keywords, and
// the declarations/assignments of z, d, f, diffuse, specular, glosstex and
// eyenormal) are not visible in this listing.
3648 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3650 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3651 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3652 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3653 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3654 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3655 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3656 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3657 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3658 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660 int x, startx = span->startx, endx = span->endx;
3661 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3662 float LightVectordata[4];
3663 float LightVectorslope[4];
3664 float EyeVectordata[4];
3665 float EyeVectorslope[4];
3666 float VectorSdata[4];
3667 float VectorSslope[4];
3668 float VectorTdata[4];
3669 float VectorTslope[4];
3670 float VectorRdata[4];
3671 float VectorRslope[4];
3673 float diffusetex[4];
3675 float surfacenormal[4];
3676 float lightnormal[4];
3677 float lightnormal_modelspace[4];
3679 float specularnormal[4];
3682 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped (uniform
// component 0 goes to element [2]), presumably so that local colours index
// in the same B,G,R order as the *bgra8 texel buffers. Element [3] is 0
// except for Color_Ambient, whose [3] takes the Alpha uniform.
3684 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3685 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3686 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3687 Color_Glow[3] = 0.0f;
3688 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3689 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3690 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3691 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3692 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3693 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3694 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3695 Color_Pants[3] = 0.0f;
3696 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3697 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3698 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3699 Color_Shirt[3] = 0.0f;
// Start the span (fills buffer_z) and sample the textures every path needs:
// colour always, pants/shirt only with COLORMAPPING, glow only with GLOW.
3700 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3701 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3702 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3704 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3705 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3707 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3709 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular (also computes the diffuse term) ----
3711 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3713 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3714 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3715 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3716 Color_Diffuse[3] = 0.0f;
3717 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3718 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3719 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3720 LightColor[3] = 0.0f;
3721 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3722 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3723 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3724 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3725 Color_Specular[3] = 0.0f;
// SpecularPower is pre-divided by 255 because it is later multiplied by the
// gloss texel's alpha byte (glosstex[3], range 0..255).
3726 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3727 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3728 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup: the map modes sample lightmap + deluxemap via
// TEXCOORD4 (model-space additionally interpolates VectorS/T/R from
// TEXCOORD1..3); FakeLight needs nothing more; the remaining case
// interpolates the light vector from TEXCOORD5.
3730 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3732 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3733 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3734 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3735 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3736 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3738 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3740 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3741 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3743 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3745 // nothing of this needed
3749 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop for the specular path.
// NOTE(review): z is used below but its assignment is not visible in this
// listing; presumably z = buffer_z[x] - confirm.
3752 for (x = startx;x < endx;x++)
3755 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3756 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3757 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3758 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the pants and shirt texels, each tinted by its colour.
3759 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3761 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3762 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3763 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3764 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3766 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3767 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3768 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3769 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte / 128 - 1 per component, reading the
// bytes in reverse order (x from offset +2, z from offset +0), then normalize.
3770 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3771 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3772 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3773 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: decode the direction, transform it by the
// interpolated VectorS/T/R, normalize, and scale the lightmap colour by
// 1 / (256 * max(0.25, lightnormal[2])).
3775 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3777 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3778 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3779 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3780 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3782 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3783 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3784 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3785 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3787 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3788 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3789 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3790 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3792 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3793 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3794 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3795 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3797 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3798 DPSOFTRAST_Vector3Normalize(lightnormal);
3800 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3802 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3803 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3804 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3805 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded direction is used as-is (no
// normalization in the visible lines); light colour is lightmap texel / 256.
3808 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3810 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3811 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3812 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3814 float f = 1.0f / 256.0f;
3815 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3816 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3817 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// FakeLight: the light direction is the normalized eye vector and the light
// colour is white.
3820 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3822 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3823 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3824 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3825 DPSOFTRAST_Vector3Normalize(lightnormal);
3827 LightColor[0] = 1.0;
3828 LightColor[1] = 1.0;
3829 LightColor[2] = 1.0;
// Remaining mode: light direction from the interpolated TEXCOORD5 light
// vector; LightColor keeps the uniform value loaded before the loop.
3833 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3834 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3835 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3836 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N . L, clamped below at 0.
3839 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, variant A (shader_exactspecularmath set): dot of the
// normalized eye vector with the light direction reflected about the normal.
3841 if(thread->shader_exactspecularmath)
3843 // reflect lightnormal at surfacenormal, take the negative of that
3844 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3846 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3847 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3848 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3849 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3851 // dot of this and normalize(EyeVectorFogDepth.xyz)
3852 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3853 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3854 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3855 DPSOFTRAST_Vector3Normalize(eyenormal);
3857 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Variant B: half-vector form - normalize(L + E) dotted with the surface
// normal.
3861 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3862 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3863 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3864 DPSOFTRAST_Vector3Normalize(eyenormal);
3866 specularnormal[0] = lightnormal[0] + eyenormal[0];
3867 specularnormal[1] = lightnormal[1] + eyenormal[1];
3868 specularnormal[2] = lightnormal[2] + eyenormal[2];
3869 DPSOFTRAST_Vector3Normalize(specularnormal);
3871 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Raise to the specular exponent, scaled per pixel by the gloss alpha byte.
3874 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms; each channel is truncated to int and clamped to at most
// 255 (there is no lower clamp in the visible code). Alpha is
// diffusetex[3] * Color_Ambient[3].
3875 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3877 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3878 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3879 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3880 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3884 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3885 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3886 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3887 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3890 buffer_FragColorbgra8[x*4+0] = d[0];
3891 buffer_FragColorbgra8[x*4+1] = d[1];
3892 buffer_FragColorbgra8[x*4+2] = d[2];
3893 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse only (no gloss texture, no specular term) ----
// Unlike the specular path, the visible lines of this loop do not apply the
// pants/shirt colormapping texels.
3896 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3898 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3899 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3900 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3901 Color_Diffuse[3] = 0.0f;
3902 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3903 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3904 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3905 LightColor[3] = 0.0f;
3906 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup, as in the specular path, except that the eye vector
// (TEXCOORD6) is interpolated only for FakeLight, the one mode that uses it
// here.
3908 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3911 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3912 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3913 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3914 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3916 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3918 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3919 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3921 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3923 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3927 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop for the diffuse path; the light direction and colour are
// derived exactly as in the specular loop.
3930 for (x = startx;x < endx;x++)
3933 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3934 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3935 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3936 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3937 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3938 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3939 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3940 DPSOFTRAST_Vector3Normalize(surfacenormal);
3942 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3944 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3945 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3946 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3947 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3949 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3950 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3951 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3952 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3954 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3955 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3956 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3957 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3959 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3960 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3961 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3962 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3964 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3965 DPSOFTRAST_Vector3Normalize(lightnormal);
3967 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3969 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3970 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3971 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3972 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3975 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3977 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3978 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3979 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3981 float f = 1.0f / 256.0f;
3982 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3983 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3984 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3987 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3989 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3990 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3991 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3992 DPSOFTRAST_Vector3Normalize(lightnormal);
3994 LightColor[0] = 1.0;
3995 LightColor[1] = 1.0;
3996 LightColor[2] = 1.0;
4000 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4001 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4002 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4003 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term N . L clamped below at 0, then
// colour = texel * (ambient + diffuse colour * N.L * light colour) (+ glow),
// clamped to at most 255.
4006 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4007 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4009 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4010 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4011 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4012 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4016 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4017 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4018 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4019 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4021 buffer_FragColorbgra8[x*4+0] = d[0];
4022 buffer_FragColorbgra8[x*4+1] = d[1];
4023 buffer_FragColorbgra8[x*4+2] = d[2];
4024 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (+ glow) ----
// NOTE(review): the `else` introducing this loop is not visible in this
// listing; presumably it runs only when neither SPECULAR nor DIFFUSE is set.
4029 for (x = startx;x < endx;x++)
4032 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4033 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4034 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4035 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4037 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4039 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4040 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4041 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4042 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4046 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4047 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4048 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4049 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4051 buffer_FragColorbgra8[x*4+0] = d[0];
4052 buffer_FragColorbgra8[x*4+1] = d[1];
4053 buffer_FragColorbgra8[x*4+2] = d[2];
4054 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
4057 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode. For every vertex it projects the
// vertex-to-light and vertex-to-eye vectors onto the per-vertex svector,
// tvector and normal, and writes the results back into TEXCOORD1 and
// TEXCOORD2 (replacing the svector/tvector data stored there).
// Takes no parameters and returns nothing; all inputs and outputs go through
// the global `dpsoftrast` state.
// NOTE(review): the declarations of i, EyeVector, position, svector, tvector
// and normal are not visible in this listing.
4062 void DPSOFTRAST_VertexShader_LightSource(void)
4065 int numvertices = dpsoftrast.numvertices;
4066 float LightPosition[4];
4067 float LightVector[4];
4068 float LightVectorModelSpace[4];
4069 float EyePosition[4];
4070 float EyeVectorModelSpace[4];
// Fetch the LightPosition and EyePosition uniforms (four floats each).
4076 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4077 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4078 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4079 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4080 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4081 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4082 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4083 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read by the loop below; TEXCOORD0 goes through the
// TexMatrixM1 uniform matrix, the others are loaded as-is.
4084 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4085 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4086 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4087 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4088 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4089 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4090 for (i = 0;i < numvertices;i++)
// Read all per-vertex inputs first: TEXCOORD1 and TEXCOORD2 are overwritten
// further down in this same iteration.
4092 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4093 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4094 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4095 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4096 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4097 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4098 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4099 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4100 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4101 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4102 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4103 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (LightPosition - position) projected onto svector, tvector
// and normal, stored in TEXCOORD1 with w = 0.
4104 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4105 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4106 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4107 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4108 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4109 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4110 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4111 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4112 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4113 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// EyeVector = (EyePosition - position) projected onto svector, tvector and
// normal, stored in TEXCOORD2 with w = 0.
4114 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4115 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4116 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4117 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4118 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4119 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4120 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4121 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4122 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4123 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project POSITION with the ModelViewProjectionMatrixM1 uniform, then fill
// TEXCOORD3 by transforming POSITION with the ModelToLightM1 uniform matrix.
// NOTE(review): the (output, input, matrix) argument order and the meaning of
// the -1 argument are assumed from the call shapes - confirm against
// DPSOFTRAST_Array_Transform / DPSOFTRAST_Array_TransformProject.
4125 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4126 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Pixel shader for the light-source pass: shades one span of one triangle and
// hands the resulting BGRA8 pixels to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f, shader_permutation and
//              shader_exactspecularmath.
//   triangle - triangle whose interpolated attributes are evaluated.
//   span     - span to shade; pixels startx .. endx-1 are processed.
// NOTE(review): this listing does not show braces, `else`, `continue`, or the
// declarations of several locals used below (z, glosstex, eyenormal, diffuse,
// specular, attenuation, d, f) - confirm against the full source.
4129 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: depth values plus one BGRA8 buffer per sampled texture
4132 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4133 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4134 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4135 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4136 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4137 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4138 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4139 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4140 int x, startx = span->startx, endx = span->endx;
4141 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolated attributes are kept as a (data, slope) pair per attribute
4142 float CubeVectordata[4];
4143 float CubeVectorslope[4];
4144 float LightVectordata[4];
4145 float LightVectorslope[4];
4146 float EyeVectordata[4];
4147 float EyeVectorslope[4];
4149 float diffusetex[4];
4151 float surfacenormal[4];
4152 float lightnormal[4];
4154 float specularnormal[4];
4157 float SpecularPower;
4158 float CubeVector[4];
// Load the colour uniforms. Components 0,1,2 of each uniform are stored in
// slots 2,1,0, i.e. the channel order is reversed relative to the uniform
// (presumably RGB -> BGR so they line up with the *bgra8 buffers - confirm).
// Color_Glow is loaded but not referenced again in the visible lines.
4161 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4162 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4163 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4164 Color_Glow[3] = 0.0f;
4165 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4166 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4167 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// slot 3 of the ambient colour is taken from the Alpha uniform
4168 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4169 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4170 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4171 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4172 Color_Diffuse[3] = 0.0f;
4173 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4174 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4175 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4176 Color_Specular[3] = 0.0f;
4177 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4178 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4179 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4180 Color_Pants[3] = 0.0f;
4181 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4182 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4183 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4184 Color_Shirt[3] = 0.0f;
4185 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4186 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4187 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4188 LightColor[3] = 0.0f;
// pre-scaled by 1/255: it is multiplied by the byte-valued fourth gloss channel
// before being used as the pow() exponent in the specular path
4189 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// set up (data, slope) pairs for TEXCOORD1/2/3; the loops below evaluate them
// per pixel as (data + slope*x) * z
4190 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4191 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4192 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4193 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4194 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// sample, for the whole span, the textures required by the active permutation
4195 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4196 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4198 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4199 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4201 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4202 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- SPECULAR permutation: ambient + diffuse + specular terms ----
4203 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4205 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4206 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4207 for (x = startx;x < endx;x++)
// NOTE(review): z is used below but its assignment is not visible in this
// listing (presumably read from buffer_z for this pixel) - confirm
4210 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4211 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4212 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation is 1 minus the squared length of the interpolated TEXCOORD3 vector
4213 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by each `attenuation < 0.01f` test is
// not visible in this listing (presumably it skips the pixel, leaving the
// cleared colour in place) - confirm
4214 if (attenuation < 0.01f)
4216 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4218 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4219 if (attenuation < 0.01f)
// base colour, optionally with the pants/shirt layers added, each scaled by its colour
4223 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4224 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4225 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4226 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4227 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4229 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4230 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4231 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4232 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4234 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4235 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4236 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4237 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: bytes 2,1,0 become components 0,1,2, each mapped as
// byte/128 - 1, then the vector is normalized
4238 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4239 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4240 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4241 DPSOFTRAST_Vector3Normalize(surfacenormal);
4243 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4244 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4245 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4246 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surface normal, light direction), negative values set to 0
4248 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact mode: dot of the eye direction with the light direction mirrored about
// the surface normal
4250 if(thread->shader_exactspecularmath)
4252 // reflect lightnormal at surfacenormal, take the negative of that
4253 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4255 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4256 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4257 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4258 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4260 // dot of this and normalize(EyeVectorFogDepth.xyz)
4261 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4262 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4263 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4264 DPSOFTRAST_Vector3Normalize(eyenormal);
4266 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// other mode (presumably the `else` of the test above, which this listing does
// not show): dot of the surface normal with the normalized sum of the light
// and eye directions
4270 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4271 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4272 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4273 DPSOFTRAST_Vector3Normalize(eyenormal);
4275 specularnormal[0] = lightnormal[0] + eyenormal[0];
4276 specularnormal[1] = lightnormal[1] + eyenormal[1];
4277 specularnormal[2] = lightnormal[2] + eyenormal[2];
4278 DPSOFTRAST_Vector3Normalize(specularnormal);
4280 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) times the gloss byte in channel 3
4282 specular = pow(specular, SpecularPower * glosstex[3]);
// combine the terms per channel; results are clamped to 255 from above only
4284 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4286 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4287 attenuation *= (1.0f / 255.0f);
4288 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4289 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4290 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4291 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube-filter factor (presumably the `else` branch)
4295 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4296 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4297 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4298 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4300 buffer_FragColorbgra8[x*4+0] = d[0];
4301 buffer_FragColorbgra8[x*4+1] = d[1];
4302 buffer_FragColorbgra8[x*4+2] = d[2];
4303 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE permutation: ambient + diffuse terms, no specular ----
4306 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4308 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4309 for (x = startx;x < endx;x++)
// attenuation and shadow-map handling repeat the SPECULAR loop
4312 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4313 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4314 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4315 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4316 if (attenuation < 0.01f)
4318 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4320 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4321 if (attenuation < 0.01f)
4325 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4326 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4327 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4328 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4329 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4331 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4332 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4333 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4334 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4336 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4337 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4338 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4339 DPSOFTRAST_Vector3Normalize(surfacenormal);
4341 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4342 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4343 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4344 DPSOFTRAST_Vector3Normalize(lightnormal);
4346 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4347 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4349 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4350 attenuation *= (1.0f / 255.0f);
4351 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4352 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4353 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4354 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4358 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4359 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4360 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4361 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4363 buffer_FragColorbgra8[x*4+0] = d[0];
4364 buffer_FragColorbgra8[x*4+1] = d[1];
4365 buffer_FragColorbgra8[x*4+2] = d[2];
4366 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- neither SPECULAR nor DIFFUSE (presumably under an `else` that this
// listing does not show): ambient term only, no normal map is sampled ----
4371 for (x = startx;x < endx;x++)
4374 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4375 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4376 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4377 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4378 if (attenuation < 0.01f)
4380 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4382 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4383 if (attenuation < 0.01f)
4387 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4388 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4389 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4390 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4391 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4393 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4394 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4395 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4396 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4398 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4400 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4401 attenuation *= (1.0f / 255.0f);
4402 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4403 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4404 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4405 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4409 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4410 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4411 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4412 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4414 buffer_FragColorbgra8[x*4+0] = d[0];
4415 buffer_FragColorbgra8[x*4+1] = d[1];
4416 buffer_FragColorbgra8[x*4+2] = d[2];
4417 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the span finisher
4420 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction pass. Takes no arguments and works on the
// global dpsoftrast state via three array helpers:
//   - POSITION through the ModelViewProjection matrix uniform into TEXCOORD4
//     (read back by the pixel shader as ModelViewProjectionPosition),
//   - TEXCOORD0 through the TexMatrix uniform, in place,
//   - POSITION through the ModelViewProjection matrix uniform with
//     DPSOFTRAST_Array_TransformProject, in place.
4426 void DPSOFTRAST_VertexShader_Refraction(void)
4428 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4429 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4430 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the refraction pass: for each pixel of the span, looks up
// the texture bound to GL20TU_REFRACTION at a screen coordinate offset by the
// normal map, multiplies it by RefractColor and passes the span to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies texbound and uniform4f.
//   triangle - triangle whose interpolated attributes are evaluated.
//   span     - span to shade; pixels startx .. endx-1 are processed.
// Returns without drawing anything when no refraction texture is bound.
// NOTE(review): the declarations of iw, v and c are not visible in this
// listing - confirm against the full source.
4433 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4435 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4437 int x, startx = span->startx, endx = span->endx;
// texture data sampled for the span and the output colour buffer
4440 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4441 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
// interpolated attribute kept as a (data, slope) pair
4444 float ModelViewProjectionPositiondata[4];
4445 float ModelViewProjectionPositionslope[4];
// uniforms; only the first two components of the RefractReflect vectors are used here
4448 float ScreenScaleRefractReflect[2];
4449 float ScreenCenterRefractReflect[2];
4450 float DistortScaleRefractReflect[2];
4451 float RefractColor[4];
4453 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4454 if(!texture) return;
4457 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4458 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// TEXCOORD4 is the array the refraction vertex shader fills from POSITION
4461 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4464 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4465 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4466 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4467 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4468 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4469 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// colour components 0,1,2 are loaded from uniform components 2,1,0 (reversed
// channel order); component 3 is copied straight across
4470 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4471 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4472 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4473 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4476 for (x = startx;x < endx;x++)
4478 float SafeScreenTexCoord[2];
4479 float ScreenTexCoord[2];
4486 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// reciprocal of the interpolated w component at this pixel
4487 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4489 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4490 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4491 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the GLSL quoted below scales by DistortScaleRefractReflect.zw,
// whereas this code uses components 0 and 1 - confirm which is intended
4493 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map: bytes 2,1,0 become components 0,1,2 via byte/128 - 1
4494 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4495 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4496 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4497 DPSOFTRAST_Vector3Normalize(v);
4498 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4499 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4501 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4502 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
// colour channels are the sampled texel scaled by RefractColor; the fourth
// channel comes from RefractColor[3] alone, scaled by 256 and capped at 255
4504 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4505 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4506 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4507 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4510 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water pass. Takes no arguments and works on the
// global dpsoftrast state. For every vertex it writes into TEXCOORD6 the
// vector from the vertex to the EyePosition uniform, expressed as its dot
// products with the vectors held in TEXCOORD1, TEXCOORD2 and TEXCOORD3
// (svector, tvector, normal). It then runs the position and texture-coordinate
// transforms.
// NOTE(review): the declarations of i, position, svector, tvector, normal and
// EyeVector, and the loop braces, are not visible in this listing - confirm.
4515 void DPSOFTRAST_VertexShader_Water(void)
4518 int numvertices = dpsoftrast.numvertices;
4519 float EyePosition[4];
4520 float EyeVectorModelSpace[4];
// EyePosition[3] is loaded but not used by the visible code below
4526 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4527 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4528 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4529 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the arrays read through post_array4f below into the post arrays
4530 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4531 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4532 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4533 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4534 for (i = 0;i < numvertices;i++)
// each vertex occupies four consecutive floats; only the first three are read
4536 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4537 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4538 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4539 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4540 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4541 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4542 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4543 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4544 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4545 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4546 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4547 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the eye position
4548 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4549 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4550 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
// its dot products with svector, tvector and normal
4551 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4552 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4553 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// stored in TEXCOORD6 with the fourth component set to 0; the water pixel shader reads it
4554 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4555 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4556 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4557 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): -1 is passed as the input array here, unlike the other calls -
// presumably "use the already loaded post array"; confirm in the helper
4559 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4560 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4561 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4565 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4567 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4569 int x, startx = span->startx, endx = span->endx;
4572 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4573 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4576 float ModelViewProjectionPositiondata[4];
4577 float ModelViewProjectionPositionslope[4];
4578 float EyeVectordata[4];
4579 float EyeVectorslope[4];
4582 float ScreenScaleRefractReflect[2];
4583 float ScreenCenterRefractReflect[2];
4584 float DistortScaleRefractReflect[2];
4585 float RefractColor[4];
4586 float ReflectColor[4];
4587 float ReflectFactor;
4588 float ReflectOffset;
4590 DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4591 DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4592 if(!texture_refraction || !texture_reflection) return;
4595 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4596 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4599 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4600 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4603 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4604 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4605 ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4606 ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4607 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4608 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4609 ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4610 ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4611 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4612 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4613 DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4614 DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4615 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4616 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4617 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4618 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4619 ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4620 ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4621 ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4622 ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4623 ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4624 ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4627 for (x = startx;x < endx;x++)
4629 float SafeScreenTexCoord[4];
4630 float ScreenTexCoord[4];
4633 unsigned char c1[4];
4634 unsigned char c2[4];
4639 // " vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4640 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4642 // " vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4643 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4644 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4645 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4646 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4648 // " vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4649 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4650 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4651 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4652 DPSOFTRAST_Vector3Normalize(v);
4653 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4654 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4655 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4656 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4658 // " float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4659 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4660 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4661 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4662 DPSOFTRAST_Vector3Normalize(v);
4663 Fresnel = 1.0f - v[2];
4664 Fresnel = min(1.0f, Fresnel);
4665 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4667 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4668 // " dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4669 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4670 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4672 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4673 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4674 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4675 buffer_FragColorbgra8[x*4+3] = min(( RefractColor[3] * (1.0f - Fresnel) + ReflectColor[3] * Fresnel) * 256, 255);
4678 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode.
// Runs DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION as both
// the source and the destination array, using the uniform4f block that starts
// at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1. No other arrays are touched.
4683 void DPSOFTRAST_VertexShader_ShowDepth(void)
4685 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the ShowDepth shader mode.
// thread/triangle/span are forwarded unchanged to the span begin/finish helpers.
// NOTE(review): the fragment bytes written are always zero, independent of
// buffer_z - this looks like a placeholder implementation; confirm.
4688 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4691 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4692 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4693 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// zero the 4 bytes per pixel for the pixels in [span->startx, span->endx)
4694 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4695 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode.
// Only transforms/projects the position array in place (source and destination
// are both DPSOFTRAST_ARRAY_POSITION) with the uniform4f block that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4700 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4702 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredGeometry shader mode.
// Same visible sequence as the ShowDepth span stage: begin the span, zero-fill
// the BGRA8 fragment bytes for the span's pixel range, finish the span.
// NOTE(review): emits all-zero fragments; presumably a stub - confirm.
4705 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4708 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4709 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4710 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, pixels [span->startx, span->endx)
4711 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4712 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode.
// Only transforms/projects the position array in place (source and destination
// are both DPSOFTRAST_ARRAY_POSITION) with the uniform4f block that starts at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4717 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4719 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the DeferredLightSource shader mode.
// Begins the span, zero-fills the BGRA8 fragment bytes for the span's pixel
// range and finishes the span.
// NOTE(review): emits all-zero fragments; presumably a stub - confirm.
4722 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4725 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4726 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4727 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, pixels [span->startx, span->endx)
4728 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4729 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; one instance per entry of
// DPSOFTRAST_ShaderModeTable.
4734 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; called once per draw by DPSOFTRAST_DrawTriangles
4737 void (*Vertex)(void);
// span (pixel) stage; called per span by DPSOFTRAST_Draw_ProcessSpans
4738 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices used by this mode; readers compare each entry against
// DPSOFTRAST_ARRAY_TOTAL, and table entries end the list with ~0
4739 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by this mode; readers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and table entries end the list with ~0
4740 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4742 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with SHADERMODE_COUNT entries, indexed by shader mode
// (thread->shader_mode / dpsoftrast.shader_mode).
// Each entry lists, in order: a leading integer (presumably the lodarrayindex
// member read by DPSOFTRAST_Interpret_Draw - its declaration is not visible
// here), the vertex stage, the span stage, the vertex arrays the mode uses and
// the texture units the mode uses. Both lists are terminated by ~0, which
// becomes 255 when stored in an unsigned char and therefore fails the
// "< DPSOFTRAST_ARRAY_TOTAL" / "< DPSOFTRAST_MAXTEXTUREUNITS" range checks made
// by the readers.
// The last three entries carry an explicit {~0} texture-unit list like the
// Depth_Or_Shadow entry: an omitted initializer would be zero-filled, i.e. a
// list of sixteen references to unit 0 with no terminator.
4744 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4746 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4747 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4748 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4749 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4750 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4751 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4752 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4753 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4754 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4755 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4756 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4757 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4758 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4759 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}, {~0}},
4760 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}, {~0}},
4761 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}, {~0}},
// Builds the per-pixel visibility mask for one span.
// thread: supplies the mask storage (pixelmaskarray), the depthtest flag and
//         the depth comparison function (fb_depthfunc).
// span:   supplies the pixel range and the depth plane (depthbase/depthslope);
//         on return span->pixelmask points at the mask and span->startx holds
//         the (possibly trimmed) start of the range.
// The interpolated depth of pixel x is depthbase + depthslope*x; it is compared
// against the stored value depthpixel[x], where depthpixel already points at
// (span->x, span->y) in the depth buffer.
4764 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4769 unsigned int *depthpixel;
4773 unsigned char *pixelmask;
4774 DPSOFTRAST_State_Triangle *triangle;
4775 triangle = &thread->triangles[span->triangle];
4776 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4777 startx = span->startx;
4779 depth = span->depthbase;
4780 depthslope = span->depthslope;
4781 pixelmask = thread->pixelmaskarray;
// depth comparisons only happen when testing is enabled and a depth buffer exists
4782 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4784 switch(thread->fb_depthfunc)
// each case writes pixelmask[x] = (stored depth <op> interpolated depth d)
4787 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4788 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4789 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4790 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4791 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4792 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4793 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// skip over rejected pixels at both ends of the range (loop bodies are not
// visible in this excerpt; presumably startx++ and endx--)
4795 while (startx < endx && !pixelmask[startx])
4797 while (endx > startx && !pixelmask[endx-1])
4802 // no depth testing means we're just dealing with color...
4803 memset(pixelmask + startx, 1, endx - startx);
// publish the mask and the trimmed range on the span
4805 span->pixelmask = pixelmask;
4806 span->startx = startx;
// Depth-buffer update pass for one span, run after the span's pixel shader.
// Does nothing unless depth writes (thread->depthmask) and depth testing are
// both enabled and a depth buffer exists.
// Walks the span's pixel range with the same interpolation as the depth test
// (d = depthbase + depthslope*x). The loop body is not visible in this excerpt;
// presumably it stores d into depthpixel[x] for pixels whose pixelmask is set.
4810 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4812 int x, d, depth, depthslope, startx, endx;
4813 const unsigned char *pixelmask;
4814 unsigned int *depthpixel;
4815 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4817 depth = span->depthbase;
4818 depthslope = span->depthslope;
4819 pixelmask = span->pixelmask;
4820 startx = span->startx;
// depthpixel is based at (span->x, span->y), so x below is relative to span->x
4822 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4823 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Flushes the thread's queued spans: for each of the thread->numspans spans it
// runs the depth test, then the shader mode's span function, then the depth
// write, and finally resets thread->numspans to 0.
// thread->numtriangles is not reset here; callers do that themselves.
4829 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4832 DPSOFTRAST_State_Triangle *triangle;
4833 DPSOFTRAST_State_Span *span;
4834 for (i = 0; i < thread->numspans; i++)
4836 span = &thread->spans[i];
4837 triangle = &thread->triangles[span->triangle];
4838 DPSOFTRAST_Draw_DepthTest(thread, span);
// the depth test may have trimmed the span to nothing (the guarded statement is
// not visible in this excerpt; presumably it skips to the next span)
4839 if (span->startx >= span->endx)
4841 // run pixel shader if appropriate
4842 // do this before running depthmask code, to allow the pixelshader
4843 // to clear pixelmask values for alpha testing
4844 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4845 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4846 DPSOFTRAST_Draw_DepthWrite(thread, span);
4848 thread->numspans = 0;
// Draw command (opcode 22) and its per-thread interpreter.
// The command carries the y range covered by the draw (starty/endy), a
// reference count shared by the threads (refcount), the clipped flag, the
// vertex/triangle counts, the vertex data block (arrays) and one of two index
// lists (element3i or element3s).
//
// DPSOFTRAST_Interpret_Draw rasterizes the command's triangles for one thread:
//   1. clamps the thread's two row bands (miny1..maxy1, miny2..maxy2) to the
//      scissor rectangle and returns early if the draw touches neither band;
//   2. for every triangle: fetches the three indices, computes the near-plane
//      distance (z + w) of each vertex, builds 3 or 4 screen-space points,
//      rejects triangles outside the scissor/band bounds, derives the w plane,
//      the optional user clip plane, one plane per vertex array of the shader
//      mode, and a mip level per texture unit;
//   3. walks the triangle rows edge by edge and appends spans of at most
//      DPSOFTRAST_DRAW_MAXSPANLENGTH pixels to thread->spans, flushing through
//      DPSOFTRAST_Draw_ProcessSpans when the span or triangle queue fills up;
//   4. drops its reference on the command; when the count reaches zero and the
//      command is no larger than the bare command struct (i.e. the vertex data
//      was not stored inside the command), command->arrays is released.
4851 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4853 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4856 int cullface = thread->cullface;
4857 int minx, maxx, miny, maxy;
4858 int miny1, maxy1, miny2, maxy2;
4859 __m128i fbmin, fbmax;
4860 __m128 viewportcenter, viewportscale;
4861 int firstvertex = command->firstvertex;
4862 int numvertices = command->numvertices;
4863 int numtriangles = command->numtriangles;
4864 const int *element3i = command->element3i;
4865 const unsigned short *element3s = command->element3s;
4866 int clipped = command->clipped;
4873 int starty, endy, bandy;
4877 float clip0origin, clip0slope;
4879 __m128 triangleedge1, triangleedge2, trianglenormal;
4882 DPSOFTRAST_State_Triangle *triangle;
4883 DPSOFTRAST_Texture *texture;
4884 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp both of this thread's row bands to the scissor's vertical extent
4885 miny = thread->fb_scissor[1];
4886 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4887 miny1 = bound(miny, thread->miny1, maxy);
4888 maxy1 = bound(miny, thread->maxy1, maxy);
4889 miny2 = bound(miny, thread->miny2, maxy);
4890 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw's row range misses both bands: only release this thread's reference
4891 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4893 if (!ATOMIC_DECREMENT(command->refcount))
4895 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4896 MM_FREE(command->arrays);
// 16-bit (x, y) bounds used to reject and clamp triangle bounding boxes;
// fbmax is inclusive, hence the subtraction of 1
4900 minx = thread->fb_scissor[0];
4901 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4902 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4903 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4904 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4905 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4906 screen[3] = _mm_setzero_ps();
4907 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4908 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space coordinates, followed by
// the per-vertex arrays (4 floats per vertex each)
4910 const float *screencoord4f = command->arrays;
4911 const float *arrays = screencoord4f + numvertices*4;
4913 // generate the 3 edges of this triangle
4914 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are made relative to the first vertex of the draw; they come
// from the 16-bit list or the 32-bit list (the selecting condition is not
// visible in this excerpt)
4917 e[0] = element3s[i*3+0] - firstvertex;
4918 e[1] = element3s[i*3+1] - firstvertex;
4919 e[2] = element3s[i*3+2] - firstvertex;
4923 e[0] = element3i[i*3+0] - firstvertex;
4924 e[1] = element3i[i*3+1] - firstvertex;
4925 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//   SKIPBACKFACE            - builds the two edge vectors that meet at screen[1]
//                             and one scalar cross-product term (trianglenormal),
//                             then compares it against zero for the cull test
//   CLIPPEDVERTEXLERP(k,a,b) - stores clipfrac[a] = clipdist[a]/(clipdist[a]-clipdist[b]),
//                             interpolates vertex a towards b by that fraction and
//                             projects the result into screen[k]
//   CLIPPEDVERTEXCOPY(k,a)  - loads the already projected vertex a into screen[k]
//   GENATTRIBCOPY/GENATTRIBLERP - same copy/interpolate choice for an attribute
//                             array, reusing clipfrac
//   GENATTRIBS              - picks the copy/lerp combination matching the clip case
4934 #define SKIPBACKFACE \
4935 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4936 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4937 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4938 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4939 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4943 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4947 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4952 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4953 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4955 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4956 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4958 #define CLIPPEDVERTEXCOPY(k,p1) \
4959 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4961 #define GENATTRIBCOPY(attrib, p1) \
4962 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4963 #define GENATTRIBLERP(attrib, p1, p2) \
4965 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4966 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4968 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4972 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4973 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4974 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4975 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4976 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4977 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4978 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4984 // calculate distance from nearplane
4985 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4986 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4987 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4988 if (clipdist[0] >= 0.0f)
4990 if (clipdist[1] >= 0.0f)
4992 if (clipdist[2] >= 0.0f)
4995 // triangle is entirely in front of nearplane
4996 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5003 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5011 if (clipdist[2] >= 0.0f)
5013 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5020 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5027 else if (clipdist[1] >= 0.0f)
5029 if (clipdist[2] >= 0.0f)
5031 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5038 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5044 else if (clipdist[2] >= 0.0f)
5046 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5051 else continue; // triangle is entirely behind nearplane
5054 // calculate integer y coords for triangle points
// the x/y of up to four points are truncated and packed to 16 bits, reduced to
// a bounding box, and clamped against fbmin/fbmax
5055 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5056 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5057 screenmin = _mm_min_epi16(screeni, screenir),
5058 screenmax = _mm_max_epi16(screeni, screenir);
5059 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5060 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5061 screenmin = _mm_max_epi16(screenmin, fbmin);
5062 screenmax = _mm_min_epi16(screenmax, fbmax);
5063 // skip offscreen triangles
5064 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5066 starty = _mm_extract_epi16(screenmin, 1);
5067 endy = _mm_extract_epi16(screenmax, 1)+1;
// rows lie entirely in the gap between this thread's two bands
5068 if (starty >= maxy1 && endy <= miny2)
5070 screeny = _mm_srai_epi32(screeni, 16);
// claim the next slot of the thread's triangle queue
5073 triangle = &thread->triangles[thread->numtriangles];
5075 // calculate attribute plans for triangle data...
5076 // okay, this triangle is going to produce spans, we'd better project
5077 // the interpolants now (this is what gives perspective texturing),
5078 // this consists of simply multiplying all arrays by the W coord
5079 // (which is basically 1/Z), which will be undone per-pixel
5080 // (multiplying by Z again) to get the perspective-correct array
5083 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5084 __m128 mipedgescale, mipdensity;
// edge vectors divided by the scalar cross-product term, then broadcast per lane
5085 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5086 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5087 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5088 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5089 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for w: triangle->w = { x slope, y slope, value at origin }
5090 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5091 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5092 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5093 attribedge1 = _mm_sub_ss(w0, w1);
5094 attribedge2 = _mm_sub_ss(w2, w1);
5095 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5096 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5097 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5098 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5099 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5100 _mm_store_ss(&triangle->w[0], attribxslope);
5101 _mm_store_ss(&triangle->w[1], attribyslope);
5102 _mm_store_ss(&triangle->w[2], attriborigin);
// optional user clip plane: build the plane of screen z the same way, combine
// it with fb_clipplane, and express the boundary as an x position per row
// (clip0origin + row*clip0slope)
5107 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5109 float cliporigin, clipxslope, clipyslope;
5110 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5111 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5112 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5113 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5114 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5115 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5116 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5117 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5118 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5121 clip0origin = -cliporigin/clipxslope;
5122 clip0slope = -clipyslope/clipxslope;
5123 clip0dir = clipxslope > 0 ? 1 : -1;
5125 else if(clipyslope > 0)
5127 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5128 clip0slope = dpsoftrast.fb_width;
5131 else if(clipyslope < 0)
5133 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5134 clip0slope = -dpsoftrast.fb_width;
5137 else if(clip0origin < 0) continue;
// one plane (x slope, y slope, origin) per vertex array listed for the shader
// mode; attributes are multiplied by w first, as described above
5140 mipedgescale = _mm_setzero_ps();
5141 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5143 __m128 attrib0, attrib1, attrib2;
5144 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5145 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5147 arrays += numvertices*4;
5148 GENATTRIBS(attrib0, attrib1, attrib2);
5149 attriborigin = _mm_mul_ps(attrib1, w1);
5150 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5151 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5152 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5153 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5154 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5155 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5156 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5157 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex also provides the attribute-change-per-pixel
// scale used for mip selection below
5158 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5160 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5161 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5162 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5163 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit of the shader mode; units whose texture
// is unbound or whose filter is not above DPSOFTRAST_TEXTURE_FILTER_LINEAR keep
// level 0 from the memset
5167 memset(triangle->mip, 0, sizeof(triangle->mip));
5168 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5170 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5171 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5173 texture = thread->texbound[texunit];
5174 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5176 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5177 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5178 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5179 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5180 // this will be multiplied in the texturing routine by the texture resolution
5181 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, clamped to the last mip level
5184 y = (int)(log((float)y)*0.5f/M_LN2);
5185 if (y > texture->mipmaps - 1)
5186 y = texture->mipmaps - 1;
5187 triangle->mip[texunit] = y;
// scan the triangle's rows band by band: the first pass stops at maxy1, the
// loop increment then moves y up to miny2 and extends the limit to maxy2
5193 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5196 __m128 xcoords, xslope;
// per-point mask of "y is greater than this point's integer y"; the mask value
// selects which two edges (previous point -> next point) bound the current rows
5197 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5198 int yccmask = _mm_movemask_epi8(ycc);
5199 int edge0p, edge0n, edge1p, edge1n;
5208 case 0xFFFF: /*0000*/ y = endy; continue;
5209 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5210 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5211 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5212 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5213 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5214 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5215 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5216 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5217 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5218 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5219 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5220 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5221 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5222 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5223 case 0x0000: /*1111*/ y++; continue;
5231 case 0xFFFF: /*000*/ y = endy; continue;
5232 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5233 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5234 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5235 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5236 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5237 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5238 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can reuse this edge pair, limited to the current band
5241 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5242 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5243 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5244 nexty = _mm_extract_epi16(ycc, 0);
5245 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges and their x positions at row y (plus 0.5); the pair is
// swapped if needed so the low half holds the smaller x
5246 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5247 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5248 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5249 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5250 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5251 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5253 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5254 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5256 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5257 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5259 int startx, endx, offset;
// row extent, clamped to the scissor and then to the clip-plane boundary clip0
5260 startx = _mm_cvtss_si32(xcoords);
5261 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5262 if (startx < minx) startx = minx;
5263 if (endx > maxx) endx = maxx;
5264 if (startx >= endx) continue;
5272 if(endx <= clip0) continue;
5273 startx = (int)clip0;
5276 else if (endx > clip0)
5278 if(startx >= clip0) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5283 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5285 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5286 span->triangle = thread->numtriangles;
5290 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5291 if (span->startx >= span->endx)
// fixed-point depth plane for the span, derived from the w plane and the
// polygon offset settings
5293 wslope = triangle->w[0];
5294 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5295 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5296 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: shade everything queued so far
5297 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5298 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that reference it and start over
5303 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5305 DPSOFTRAST_Draw_ProcessSpans(thread);
5306 thread->numtriangles = 0;
// done with the command's data: drop this thread's reference, and free the
// separately allocated vertex data when the count reaches zero
5310 if (!ATOMIC_DECREMENT(command->refcount))
5312 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5313 MM_FREE(command->arrays);
// shade whatever is still queued before returning
5316 if (thread->numspans > 0 || thread->numtriangles > 0)
5318 DPSOFTRAST_Draw_ProcessSpans(thread);
5319 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its vertex and index data, and
// points the global vertex-processing state (dpsoftrast.screencoord4f,
// dpsoftrast.post_array4f[]) at that storage.
// firstvertex/numvertices/numtriangles: copied into the command.
// element3i/element3s: index lists; one of them is copied behind the vertex
//     data (the selecting condition is not visible in this excerpt).
// Data layout, 4 floats per vertex per array: screen coordinates, the
// position array, one array per entry of the current shader mode's arrays
// list, then the index data.
// If command + data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is
// allocated separately with MM_CALLOC and only the bare command goes into the
// command stream; otherwise the data is placed directly after the command.
// NOTE(review): no check of the MM_CALLOC result is visible here - confirm
// whether allocation failure is handled elsewhere.
5324 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5328 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two fixed arrays: screen coordinates and positions
5329 int datasize = 2*numvertices*sizeof(float[4]);
5330 DPSOFTRAST_Command_Draw *command;
5331 unsigned char *data;
// plus one array per entry listed for the current shader mode
5332 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5334 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5335 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5337 datasize += numvertices*sizeof(float[4]);
// plus the index list in whichever width is used
5340 datasize += numtriangles*sizeof(unsigned short[3]);
5342 datasize += numtriangles*sizeof(int[3]);
5343 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: keep the data in its own allocation
5344 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5346 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5347 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives in the command stream right behind the command
5351 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5352 data = (unsigned char *)command + commandsize;
5354 command->firstvertex = firstvertex;
5355 command->numvertices = numvertices;
5356 command->numtriangles = numtriangles;
5357 command->arrays = (float *)data;
// carve the data block up in the same order used to size it
5358 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5359 dpsoftrast.firstvertex = firstvertex;
5360 dpsoftrast.numvertices = numvertices;
5361 dpsoftrast.screencoord4f = (float *)data;
5362 data += numvertices*sizeof(float[4]);
5363 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5364 data += numvertices*sizeof(float[4]);
5365 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5367 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5368 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5370 dpsoftrast.post_array4f[j] = (float *)data;
5371 data += numvertices*sizeof(float[4]);
// the command owns a private copy of the index list
5373 command->element3i = NULL;
5374 command->element3s = NULL;
5377 command->element3s = (unsigned short *)data;
5378 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5382 command->element3i = (int *)data;
5383 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one Draw command for the given vertex range
// and index list.
// firstvertex/numvertices: vertex range of the draw.
// numtriangles, element3i, element3s: triangle count and the index list in
//     32-bit or 16-bit form; all are forwarded to
//     DPSOFTRAST_Draw_AllocateDrawCommand.
// Steps: allocate the command, run the current shader mode's vertex function,
// clamp the covered rows to the framebuffer height, discard the command if no
// rows remain, otherwise publish it to the worker threads.
5388 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5390 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5391 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// dpsoftrast.drawstarty/drawendy are read after the vertex stage has run
5392 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5393 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: release separately allocated data and take the command back
5394 if (command->starty >= command->endy)
5396 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5397 MM_FREE(command->arrays);
5398 DPSOFTRAST_UndoCommand(command->commandsize);
5401 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it in DPSOFTRAST_Interpret_Draw
5402 command->refcount = dpsoftrast.numthreads;
5404 if (dpsoftrast.usethreads)
5407 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row bands overlap the draw
5408 for (i = 0; i < dpsoftrast.numthreads; i++)
5410 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5411 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5412 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded path (the branch structure is not visible here)
5417 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command (opcode 23) carrying the new width and height.
// The interpreter does not read the command fields in the visible code; it
// only marks the thread's framebuffer state as needing revalidation.
5421 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5422 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5424 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point that selects the framebuffer the rasterizer draws into.
// width/height: framebuffer dimensions.
// depthpixels: depth buffer pointer.
// colorpixels0..3: the four color buffer pointers.
// Stores all values in the global state, recomputes the viewport center/scale
// and queues a SetRenderTargets command with the new width and height.
5426 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5428 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target differs from the current one (the guarded statement is
// not visible in this excerpt; presumably pending work is flushed first)
5429 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5430 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5431 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5433 dpsoftrast.fb_width = width;
5434 dpsoftrast.fb_height = height;
5435 dpsoftrast.fb_depthpixels = depthpixels;
5436 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5437 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5438 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5439 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport transform depends on the framebuffer size
5440 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5441 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5442 command->width = width;
5443 command->height = height;
// Executes queued commands on behalf of one thread.
//
// thread    - thread state; execution starts at thread->commandoffset and
//             the final offset is written back to it
// endoffset - offset in dpsoftrast.commandpool.commands at which to stop
//
// NOTE(review): listing numbers skip values inside this function (e.g. 5460,
// 5489, 5491, 5494-5497), so short lines such as "break;" and brace lines
// appear to be missing from this view. Confirm against the complete file.
5446 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5448 int commandoffset = thread->commandoffset;
5449 while (commandoffset != endoffset)
5451 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5452 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name>, and wrap the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5454 #define INTERPCOMMAND(name) \
5455 case DPSOFTRAST_OPCODE_##name : \
5456 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5457 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5458 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5459 commandoffset = 0; \
5461 INTERPCOMMAND(Viewport)
5462 INTERPCOMMAND(ClearColor)
5463 INTERPCOMMAND(ClearDepth)
5464 INTERPCOMMAND(ColorMask)
5465 INTERPCOMMAND(DepthTest)
5466 INTERPCOMMAND(ScissorTest)
5467 INTERPCOMMAND(Scissor)
5468 INTERPCOMMAND(BlendFunc)
5469 INTERPCOMMAND(BlendSubtract)
5470 INTERPCOMMAND(DepthMask)
5471 INTERPCOMMAND(DepthFunc)
5472 INTERPCOMMAND(DepthRange)
5473 INTERPCOMMAND(PolygonOffset)
5474 INTERPCOMMAND(CullFace)
5475 INTERPCOMMAND(AlphaTest)
5476 INTERPCOMMAND(AlphaFunc)
5477 INTERPCOMMAND(SetTexture)
5478 INTERPCOMMAND(SetShader)
5479 INTERPCOMMAND(Uniform4f)
5480 INTERPCOMMAND(UniformMatrix4f)
5481 INTERPCOMMAND(Uniform1i)
5482 INTERPCOMMAND(SetRenderTargets)
5483 INTERPCOMMAND(ClipPlane)
// Draw commands advance by the size recorded in the command itself rather
// than by a fixed struct size, and the thread's offset is stored right after
// each draw instead of only at the end of the loop.
5485 case DPSOFTRAST_OPCODE_Draw:
5486 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5487 commandoffset += command->commandsize;
5488 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5490 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case is not visible in this listing.
5493 case DPSOFTRAST_OPCODE_Reset:
// Record how far this thread has progressed through the command pool.
5498 thread->commandoffset = commandoffset;
// Entry point passed to Thread_CreateThread for each rasterizer thread.
//
// data - the thread's DPSOFTRAST_State_Thread
//
// The loop runs while thread->index is non-negative. Whenever the thread's
// command offset differs from dpsoftrast.drawcommand it interprets commands
// up to that point.
//
// NOTE(review): listing numbers skip values here (e.g. 5509-5511, 5521-5525),
// so an "else", braces and the return statement appear to be missing from
// this view; the locked section below presumably runs only when the thread
// has caught up. Confirm against the complete file.
5501 static int DPSOFTRAST_Draw_Thread(void *data)
5503 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5504 while(thread->index >= 0)
5506 if (thread->commandoffset != dpsoftrast.drawcommand)
5508 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5512 Thread_LockMutex(thread->drawmutex);
// Re-check under the mutex that there is still nothing to do and the thread
// has not been asked to exit (index >= 0) before sleeping.
5513 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Wake a flusher blocked on waitcond, then mark this thread as starving
// while it sleeps on drawcond.
5515 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5516 thread->starving = true;
5517 Thread_CondWait(thread->drawcond, thread->drawmutex);
5518 thread->starving = false;
5520 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's command offset up to dpsoftrast.drawcommand and then
// resets commandpool.usedcommands to 0.
//
// NOTE(review): listing numbers skip values here (e.g. 5529, 5557-5561), so
// the declaration of i, braces and an "else" appear to be missing from this
// view; the last loop is presumably the non-threaded branch. Confirm against
// the complete file.
5526 static void DPSOFTRAST_Draw_FlushThreads(void)
5528 DPSOFTRAST_State_Thread *thread;
5530 DPSOFTRAST_Draw_SyncCommands();
5531 if (dpsoftrast.usethreads)
// Pass 1: signal drawcond for every thread that is behind and starving.
5533 for (i = 0; i < dpsoftrast.numthreads; i++)
5535 thread = &dpsoftrast.threads[i];
5536 if (thread->commandoffset != dpsoftrast.drawcommand)
5538 Thread_LockMutex(thread->drawmutex);
5539 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5540 Thread_CondSignal(thread->drawcond);
5541 Thread_UnlockMutex(thread->drawmutex);
// Pass 2: for each thread that is still behind, set its waiting flag and
// block on waitcond; the unlocked comparison is repeated under the mutex.
// NOTE(review): the wait is guarded by "if", not a loop, so the offset is not
// re-checked after Thread_CondWait returns -- confirm that the underlying
// condition variable cannot wake early.
5544 for (i = 0; i < dpsoftrast.numthreads; i++)
5546 thread = &dpsoftrast.threads[i];
5547 if (thread->commandoffset != dpsoftrast.drawcommand)
5549 Thread_LockMutex(thread->drawmutex);
5550 if (thread->commandoffset != dpsoftrast.drawcommand)
5552 thread->waiting = true;
5553 Thread_CondWait(thread->waitcond, thread->drawmutex);
5554 thread->waiting = false;
5556 Thread_UnlockMutex(thread->drawmutex);
// Interpret each lagging thread's commands directly on the calling thread.
5562 for (i = 0; i < dpsoftrast.numthreads; i++)
5564 thread = &dpsoftrast.threads[i];
5565 if (thread->commandoffset != dpsoftrast.drawcommand)
5566 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Reset the pool's used-command counter now that the threads have been
// brought up to date.
5569 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
5572 void DPSOFTRAST_Flush(void)
5574 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): the body (listing lines 5578-5581) is not visible in this
// view -- presumably it flushes all pending work; confirm in the full file.
5577 void DPSOFTRAST_Finish(void)
// Initialises the global rasterizer state and its thread states.
//
// width, height - framebuffer dimensions (stored in fb_width / fb_height and
//                 used as the initial viewport and scissor size)
// numthreads    - requested thread count; threading is enabled only when
//                 this is > 0 and Thread_HasThreads() is true, in which case
//                 the count is limited to 1..64 by bound(); otherwise one
//                 thread state is used
// interlace     - limited to 0..1 by bound() when threading is enabled,
//                 forced to 0 otherwise
// colorpixels   - stored as colour target 0; targets 1..3 start as NULL
// depthpixels   - stored as the depth target
//
// NOTE(review): listing numbers skip values inside this function (e.g.
// 5583-5591, 5620, 5667-5671), so the declarations of i and u, at least one
// per-thread assignment and the return statement are not visible in this
// view. Confirm those against the complete file.
5582 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5592 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is declared on lines not shown here -- presumably a union holding the
// integer 1, so that b[3] is non-zero only on big-endian hosts.
5593 dpsoftrast.bigendian = u.b[3];
5594 dpsoftrast.fb_width = width;
5595 dpsoftrast.fb_height = height;
5596 dpsoftrast.fb_depthpixels = depthpixels;
5597 dpsoftrast.fb_colorpixels[0] = colorpixels;
// Clear the three remaining colour targets. The original assigned index 1
// three times (copy-paste slip); indices 2 and 3 were only NULL because of
// the memset above.
5598 dpsoftrast.fb_colorpixels[1] = NULL;
5599 dpsoftrast.fb_colorpixels[2] = NULL;
5600 dpsoftrast.fb_colorpixels[3] = NULL;
// The initial viewport covers the whole framebuffer.
5601 dpsoftrast.viewport[0] = 0;
5602 dpsoftrast.viewport[1] = 0;
5603 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5604 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5605 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping starts at 1 with no storage allocated (texture_max 0).
5606 dpsoftrast.texture_firstfree = 1;
5607 dpsoftrast.texture_end = 1;
5608 dpsoftrast.texture_max = 0;
5609 dpsoftrast.color[0] = 1;
5610 dpsoftrast.color[1] = 1;
5611 dpsoftrast.color[2] = 1;
5612 dpsoftrast.color[3] = 1;
// Decide whether worker threads are used and how many thread states exist.
5613 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5614 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5615 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5616 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// Give every thread state the same default render state.
5617 for (i = 0; i < dpsoftrast.numthreads; i++)
5619 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5621 thread->cullface = GL_BACK;
5622 thread->colormask[0] = 1;
5623 thread->colormask[1] = 1;
5624 thread->colormask[2] = 1;
5625 thread->colormask[3] = 1;
5626 thread->blendfunc[0] = GL_ONE;
5627 thread->blendfunc[1] = GL_ZERO;
5628 thread->depthmask = true;
5629 thread->depthtest = true;
5630 thread->depthfunc = GL_LEQUAL;
5631 thread->scissortest = false;
5632 thread->alphatest = false;
5633 thread->alphafunc = GL_GREATER;
5634 thread->alphavalue = 0.5f;
5635 thread->viewport[0] = 0;
5636 thread->viewport[1] = 0;
5637 thread->viewport[2] = dpsoftrast.fb_width;
5638 thread->viewport[3] = dpsoftrast.fb_height;
5639 thread->scissor[0] = 0;
5640 thread->scissor[1] = 0;
5641 thread->scissor[2] = dpsoftrast.fb_width;
5642 thread->scissor[3] = dpsoftrast.fb_height;
5643 thread->depthrange[0] = 0;
5644 thread->depthrange[1] = 1;
5645 thread->polygonoffset[0] = 0;
5646 thread->polygonoffset[1] = 0;
5647 thread->clipplane[0] = 0;
5648 thread->clipplane[1] = 0;
5649 thread->clipplane[2] = 0;
5650 thread->clipplane[3] = 1;
5652 thread->numspans = 0;
5653 thread->numtriangles = 0;
5654 thread->commandoffset = 0;
5655 thread->waiting = false;
5656 thread->starving = false;
// Mark every validation bit dirty and run a full validation pass.
5658 thread->validate = -1;
5659 DPSOFTRAST_Validate(thread, -1);
// Threaded mode only: create the synchronisation objects first, then start
// the thread that runs DPSOFTRAST_Draw_Thread on this state.
5661 if (dpsoftrast.usethreads)
5663 thread->waitcond = Thread_CreateCond();
5664 thread->drawcond = Thread_CreateCond();
5665 thread->drawmutex = Thread_CreateMutex();
5666 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5672 void DPSOFTRAST_Shutdown(void)
5675 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5677 DPSOFTRAST_State_Thread *thread;
5678 for (i = 0; i < dpsoftrast.numthreads; i++)
5680 thread = &dpsoftrast.threads[i];
5681 Thread_LockMutex(thread->drawmutex);
5683 Thread_CondSignal(thread->drawcond);
5684 Thread_UnlockMutex(thread->drawmutex);
5685 Thread_WaitThread(thread->thread, 0);
5686 Thread_DestroyCond(thread->waitcond);
5687 Thread_DestroyCond(thread->drawcond);
5688 Thread_DestroyMutex(thread->drawmutex);
5691 for (i = 0;i < dpsoftrast.texture_end;i++)
5692 if (dpsoftrast.texture[i].bytes)
5693 MM_FREE(dpsoftrast.texture[i].bytes);
5694 if (dpsoftrast.texture)
5695 free(dpsoftrast.texture);
5696 if (dpsoftrast.threads)
5697 MM_FREE(dpsoftrast.threads);
5698 memset(&dpsoftrast, 0, sizeof(dpsoftrast));