3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
// Expose the engine's qboolean type under the name bool for this file.
14 typedef qboolean bool;
// Byte alignment requested from _mm_malloc by MM_MALLOC / MM_CALLOC; same value as the ATOMIC() alignment below.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of the alignment and atomic helpers:
//   ALIGN(var)        declares var with 16-byte alignment
//   ATOMIC(var)       declares var with 32-byte alignment
//   MEMORY_BARRIER    _mm_sfence() in every branch shown here
//   ATOMIC_COUNTER    integer type used with ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD
// NOTE(review): _mm_sfence() is a store fence; confirm that ordering stores alone is enough wherever MEMORY_BARRIER is used.
// Apple toolchains: OSAtomic *Barrier functions operating on a volatile int32_t.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync builtins operating on a volatile int.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and the Interlocked* functions operating on a volatile LONG.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, no barrier, and plain non-atomic ++ / -- / += on an int.
// The #ifndef guards visible below apply them only when the name is still undefined.
// NOTE(review): the guards for ALIGN, ATOMIC and ATOMIC_ADD are not visible in this excerpt -- confirm they exist.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
// Fallback _mm_cvtss_f32 for GCC releases older than 4.6, built on the vector-extract builtin
// (extracts element 0 of the __v4sf value).
// The major and minor versions are tested together so that a newer major release with a small
// minor number (for example 5.1) is not mistaken for an old compiler.
76 #if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
77 #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
// Aligned allocation helpers built on _mm_malloc / _mm_free with ATOMIC_SIZE alignment.
80 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: allocates nmemb*size bytes with ATOMIC_SIZE alignment and zero-fills them.
// A request whose byte count would overflow size_t yields NULL (as calloc does) instead of a
// short allocation; the memset is skipped whenever ptr is NULL.
82 static void *MM_CALLOC(size_t nmemb, size_t size)
84 void *ptr = (size != 0 && nmemb > ((size_t)-1) / size) ? NULL : _mm_malloc(nmemb*size, ATOMIC_SIZE);
85 if (ptr != NULL) memset(ptr, 0, nmemb*size);
89 #define MM_FREE _mm_free
// Alternate definitions that map straight onto malloc / calloc.
// NOTE(review): presumably the branch used when the aligned SSE path is unavailable; the selecting
// conditional is not visible in this excerpt.
91 #define MM_MALLOC(size) malloc(size)
92 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight texture coordinate sets.
// DPSOFTRAST_ARRAY_TOTAL comes last and is used as the dimension of per-array tables
// (e.g. the attribs member of DPSOFTRAST_State_Triangle and post_array4f in DPSOFTRAST_State).
96 typedef enum DPSOFTRAST_ARRAY_e
98 DPSOFTRAST_ARRAY_POSITION,
99 DPSOFTRAST_ARRAY_COLOR,
100 DPSOFTRAST_ARRAY_TEXCOORD0,
101 DPSOFTRAST_ARRAY_TEXCOORD1,
102 DPSOFTRAST_ARRAY_TEXCOORD2,
103 DPSOFTRAST_ARRAY_TEXCOORD3,
104 DPSOFTRAST_ARRAY_TEXCOORD4,
105 DPSOFTRAST_ARRAY_TEXCOORD5,
106 DPSOFTRAST_ARRAY_TEXCOORD6,
107 DPSOFTRAST_ARRAY_TEXCOORD7,
108 DPSOFTRAST_ARRAY_TOTAL
// One entry of the texture table (dpsoftrast.texture). Only some members are visible in this excerpt.
112 typedef struct DPSOFTRAST_Texture_s
// Filter mode, set by DPSOFTRAST_Texture_Filter.
119 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds of this texture; its users are not visible here.
122 ATOMIC_COUNTER binds;
// Pixel storage for all mip levels; NULL marks an unused slot (see DPSOFTRAST_Texture_GetByIndex).
123 unsigned char *bytes;
// Per-mip-level table as filled by DPSOFTRAST_Texture_New:
// [0] byte offset of the level within bytes, [1] byte size of the level, [2] width, [3] height, [4] depth.
124 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Granularity to which DPSOFTRAST_ALIGNCOMMAND rounds command sizes.
// NOTE(review): ALIGN_SIZE is defined outside the lines visible here; DPSOFTRAST_ALIGNCOMMAND's masking assumes a power of two.
128 #define COMMAND_SIZE ALIGN_SIZE
// Commands use the same alignment attribute as ALIGN().
129 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header of every command stored in the command pool: an opcode and the command's size in bytes.
131 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
133 unsigned char opcode;
134 unsigned short commandsize;
// Opcode 0 is the Reset command that DPSOFTRAST_AllocateCommand writes at the wrap point of the pool.
138 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares a command type:
//   - the enum constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct DPSOFTRAST_Command_<name>, starting with the same opcode / commandsize
//     header as DPSOFTRAST_Command
// The fields argument carries the payload member declarations, e.g. "int x; int y;", which the
// interpreter functions read as command->x etc. (the line that expands it is not visible here).
140 #define DEFCOMMAND(opcodeval, name, fields) \
141 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
142 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
144 unsigned char opcode; \
145 unsigned short commandsize; \
147 } DPSOFTRAST_Command_##name );
// Size in bytes of the command ring buffer (2 MiB).
149 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have; its use is not visible here.
150 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command pool: a 32-byte aligned byte array that commands are allocated from.
// The allocator also uses freecommand / usedcommands members that are not visible in this excerpt.
152 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
156 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
158 DPSOFTRAST_State_Command_Pool);
// Per-triangle interpolation data used when shading spans.
160 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
162 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// For each attribute array, three 4-float vectors as consumed by DPSOFTRAST_CALCATTRIB:
// [0] change per unit of x, [1] change per unit of y, [2] base value.
164 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
166 DPSOFTRAST_State_Triangle);
// Attribute setup for a span.
// DPSOFTRAST_CALCATTRIB (SSE): loads the x slope of attribute array arrayindex into slope and
// computes data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// The attribs vectors are read with aligned loads (_mm_load_ps).
// DPSOFTRAST_CALCATTRIB4F: scalar equivalent; slope and data are indexable 4-element float objects.
// Both macros evaluate triangle and span several times, so pass expressions without side effects.
// The span argument is parenthesized in both macros, so any pointer expression can be passed.
168 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
169 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
170 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
171 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
172 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
174 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
175 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
176 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
177 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
178 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
179 data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
180 data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
181 data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
182 data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
185 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; 16-byte aligned via ALIGN().
// The triangle member is an index (int), not a pointer.
187 typedef ALIGN(struct DPSOFTRAST_State_Span_s
189 int triangle; // triangle this span was generated by
190 int x; // framebuffer x coord
191 int y; // framebuffer y coord
192 int startx; // usable range (according to pixelmask)
193 int endx; // usable range (according to pixelmask)
194 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
195 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
196 int depthslope; // depthbuffer value pixel delta
198 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays and of pixelmaskarray (plus padding)
// in DPSOFTRAST_State_Thread.
200 #define DPSOFTRAST_DRAW_MAXSPANS 1024
201 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
202 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bit flags kept in thread->validate. A set bit means the matching derived state is stale and
// must be recomputed by DPSOFTRAST_Validate before it is used.
204 #define DPSOFTRAST_VALIDATE_FB 1
205 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
206 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three flags combined.
207 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc selects one of these from the
// source / destination blend factors and falls back to OPAQUE for unrecognized combinations.
// DPSOFTRAST_BLENDMODE_TOTAL is the last enumerator.
209 typedef enum DPSOFTRAST_BLENDMODE_e
211 DPSOFTRAST_BLENDMODE_OPAQUE,
212 DPSOFTRAST_BLENDMODE_ALPHA,
213 DPSOFTRAST_BLENDMODE_ADDALPHA,
214 DPSOFTRAST_BLENDMODE_ADD,
215 DPSOFTRAST_BLENDMODE_INVMOD,
216 DPSOFTRAST_BLENDMODE_MUL,
217 DPSOFTRAST_BLENDMODE_MUL2,
218 DPSOFTRAST_BLENDMODE_SUBALPHA,
219 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
220 DPSOFTRAST_BLENDMODE_INVADD,
221 DPSOFTRAST_BLENDMODE_TOTAL
223 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* functions store render state here,
// and the DPSOFTRAST_Recalc* functions derive the fb_* values from it.
// Only part of the structure is visible in this excerpt.
225 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
244 float polygonoffset[2];
// Clip plane after rescaling by the viewport transform (written by DPSOFTRAST_RecalcClipPlane).
246 ALIGN(float fb_clipplane[4]);
249 int shader_permutation;
250 int shader_exactspecularmath;
// Texture bound to each unit; pointers into dpsoftrast.texture, rebased by DPSOFTRAST_Texture_Grow.
252 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
// Uniform storage: four floats per uniform, and one int per uniform.
254 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
255 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
257 // DPSOFTRAST_VALIDATE_ flags
260 // derived values (DPSOFTRAST_VALIDATE_FB)
263 ALIGN(float fb_viewportcenter[4]);
264 ALIGN(float fb_viewportscale[4]);
266 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
269 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// Offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it against the pool's
// free position to see how much of the pool this thread still has to go through.
278 ATOMIC(volatile int commandoffset);
// Flags used together with the thread's mutex and condition variables in DPSOFTRAST_Draw_FreeCommandPool.
280 volatile bool waiting;
281 volatile bool starving;
// Working storage for spans, triangles and the per-pixel mask.
288 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
289 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
290 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
292 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, the API-side copy of the render state, vertex
// array pointers, the texture table, the thread array and the command pool.
// Only part of the structure is visible in this excerpt.
294 typedef ATOMIC(struct DPSOFTRAST_State_s
// Depth buffer and up to four color buffers (unused color buffers are NULL, see the clear code).
298 unsigned int *fb_depthpixels;
299 unsigned int *fb_colorpixels[4];
// Viewport transform matching dpsoftrast.viewport (updated by DPSOFTRAST_Viewport).
302 ALIGN(float fb_viewportcenter[4]);
303 ALIGN(float fb_viewportscale[4]);
306 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
307 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// Source vertex arrays.
309 const float *pointer_vertex3f;
310 const float *pointer_color4f;
311 const unsigned char *pointer_color4ub;
312 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
315 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
316 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
317 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
321 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
322 float *screencoord4f;
328 int shader_permutation;
329 int shader_exactspecularmath;
// Texture table: texture_firstfree is the index where the search for an unused slot starts.
333 int texture_firstfree;
334 DPSOFTRAST_Texture *texture;
// Message describing the most recent error detected by the API functions.
339 const char *errorstring;
// Array of numthreads thread states.
344 DPSOFTRAST_State_Thread *threads;
// Copy of commandpool.freecommand written by DPSOFTRAST_Draw_SyncCommands.
346 ATOMIC(volatile int drawcommand);
348 DPSOFTRAST_State_Command_Pool commandpool;
// The single global instance.
352 DPSOFTRAST_State dpsoftrast;
// Scale factor used by DPSOFTRAST_DEPTH32_FROM_DEPTH32F (2^30 as a float).
354 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
355 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit pixel laid out as a<<24 | r<<16 | g<<8 | b.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation.
// Arguments are parenthesized so arbitrary expressions may be passed; each is evaluated once.
// NOTE(review): inputs are not clamped (presumably expected in 0..1), and an alpha byte above 127
// shifts into the sign bit of int -- confirm callers only need the bit pattern.
356 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated.
357 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations of the per-span depth test / depth write routines defined later in the file.
359 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
360 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from a viewport rectangle.
//   viewport           {x, y, width, height}
//   fb_viewportcenter  output, 4 floats
//   fb_viewportscale   output, 4 floats
// Element 1 holds the x term and element 2 the y term. The y center is measured from
// dpsoftrast.fb_height and the y scale is negative, so y is flipped relative to the input rectangle.
// NOTE(review): elements 3 and 0 receive constants (0.5 / 0.5 and 0.0 / 1.0), presumably the depth
// and w terms -- confirm against the code that applies the transform.
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
364 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366 fb_viewportcenter[3] = 0.5f;
367 fb_viewportcenter[0] = 0.0f;
368 fb_viewportscale[1] = 0.5f * viewport[2];
369 fb_viewportscale[2] = -0.5f * viewport[3];
370 fb_viewportscale[3] = 0.5f;
371 fb_viewportscale[0] = 1.0f;
// Assigns the framebuffer row ranges miny1..maxy1 and miny2..maxy2 that this thread works on.
// Each maximum equals the next thread's minimum; all divisions are integer divisions.
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// Interlaced: the framebuffer height is cut into 2*numthreads bands and the thread gets
// band number index and band number numthreads+index.
376 if (dpsoftrast.interlace)
378 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// Single band: the height is cut into numthreads bands and both ranges describe band number index.
385 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Computes thread->fb_clipplane from thread->clipplane using the thread's viewport transform.
// Each plane coefficient is divided by the matching viewport scale; plane elements 0..3 pair
// with viewport elements 1, 2, 3, 0. The viewport center is then folded into the constant term.
// Reads fb_viewportscale / fb_viewportcenter, so it must run after DPSOFTRAST_RecalcViewport.
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
392 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// fb_viewportcenter[0] is set to 0.0f by DPSOFTRAST_RecalcViewport, so the last product contributes nothing.
396 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything derived from the framebuffer-related state of a thread:
// the scissor rectangle in framebuffer rows, the viewport transform, the clip plane and
// the thread's row ranges.
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
401 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402 // and viewport projection values
// Scissor corners; y is flipped (measured from dpsoftrast.fb_height).
405 x1 = thread->scissor[0];
406 x2 = thread->scissor[0] + thread->scissor[2];
407 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408 y2 = dpsoftrast.fb_height - thread->scissor[1];
// With the scissor test disabled the rectangle covers the whole framebuffer.
409 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// Clamp the far edges to the framebuffer size.
// NOTE(review): clamping of x1 / y1 is not visible in this excerpt -- confirm it exists.
411 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
413 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// Stored as x, y, width, height.
414 thread->fb_scissor[0] = x1;
415 thread->fb_scissor[1] = y1;
416 thread->fb_scissor[2] = x2 - x1;
417 thread->fb_scissor[3] = y2 - y1;
// Order matters: the clip plane calculation reads the viewport transform computed just before it.
419 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420 DPSOFTRAST_RecalcClipPlane(thread);
421 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth function: the configured depthfunc while depth testing is enabled,
// GL_ALWAYS otherwise.
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
426 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Selects thread->fb_blendmode from the blend factors and the subtract flag.
// The switch key packs both factors into one integer: (source factor << 16) | destination factor.
// Any combination without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// Subtractive blending: only (GL_SRC_ALPHA, GL_ONE) has a dedicated mode.
431 if (thread->blendsubtract)
433 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label of such a switch; it is also used by the second switch below.
435 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Regular blending. Note that two factor pairs map to DPSOFTRAST_BLENDMODE_MUL.
443 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
450 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the flags in f is pending on the thread;
// the expression always evaluates to 0.
// NOTE(review): the thread argument is not parenthesized inside the macro; pass a plain identifier.
460 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state selected by mask (DPSOFTRAST_VALIDATE_* bits) for this thread.
// Only bits that are both requested and pending in thread->validate are processed; each
// processed bit is cleared before the matching Recalc function runs.
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
464 mask &= thread->validate;
467 if (mask & DPSOFTRAST_VALIDATE_FB)
469 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470 DPSOFTRAST_RecalcFB(thread);
472 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
474 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475 DPSOFTRAST_RecalcDepthFunc(thread);
477 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
479 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index.
// Returns a pointer into dpsoftrast.texture when index is at least 1, below texture_end, and the
// slot has pixel storage (bytes != NULL). Index 0 is never returned.
// NOTE(review): the result for a rejected index is not visible here; callers test it with !texture.
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
486 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487 return &dpsoftrast.texture[index];
// Enlarges the texture table and fixes up every pointer into it.
491 static void DPSOFTRAST_Texture_Grow(void)
// Remember the old base address so bound-texture pointers can be rebased after the reallocation.
493 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494 DPSOFTRAST_State_Thread *thread;
498 // expand texture array as needed
// Capacity policy: a floor of 1024 entries, otherwise doubling.
499 if (dpsoftrast.texture_max < 1024)
500 dpsoftrast.texture_max = 1024;
502 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check, so a failed reallocation loses
// the old table. The rebasing below also computes offsets from the old pointer value after realloc.
503 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// Rebase the API-side bindings: keep the same element index relative to the new base.
504 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505 if (dpsoftrast.texbound[i])
506 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// Rebase the bindings held by every thread in the same way.
507 for (j = 0; j < dpsoftrast.numthreads; j++)
509 thread = &dpsoftrast.threads[j];
510 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511 if (thread->texbound[i])
512 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions in the first unused slot of the texture table.
//   flags   DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth   dimensions in texels; must be powers of two and at most DPSOFTRAST_TEXTURE_MAXSIZE
// Every rejected request records a message in dpsoftrast.errorstring.
// NOTE(review): the return statements are not visible in this excerpt; presumably the texture index
// is returned on success.
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// Cube maps store six faces per level.
525 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is formed before the per-dimension size limit is checked, so very
// large arguments can overflow int here.
528 if (width*height*depth < 1)
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
533 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// Format-specific checks; the depth format carries extra restrictions.
540 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
544 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// Flag combinations that are rejected for any format.
562 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
564 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
567 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the check directly above.
572 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
577 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when a positive x is not a power of two.
582 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
584 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
587 // find first empty slot in texture array
588 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589 if (!dpsoftrast.texture[texnum].bytes)
// The next search starts after the slot chosen here; grow the table and extend the used range as needed.
591 dpsoftrast.texture_firstfree = texnum + 1;
592 if (dpsoftrast.texture_max <= texnum)
593 DPSOFTRAST_Texture_Grow();
594 if (dpsoftrast.texture_end <= texnum)
595 dpsoftrast.texture_end = texnum + 1;
// Initialize the slot (taken after any growth, since growing may move the table).
596 texture = &dpsoftrast.texture[texnum];
597 memset(texture, 0, sizeof(*texture));
598 texture->flags = flags;
599 texture->width = width;
600 texture->height = height;
601 texture->depth = depth;
602 texture->sides = sides;
// Mip level table: s is the byte size of a level (4 bytes per texel, all sides); each entry records
// the offset reached so far (size), the level size and its dimensions.
614 s = w * h * d * sides * 4;
615 texture->mipmap[mipmaps][0] = size;
616 texture->mipmap[mipmaps][1] = s;
617 texture->mipmap[mipmaps][2] = w;
618 texture->mipmap[mipmaps][3] = h;
619 texture->mipmap[mipmaps][4] = d;
// The chain ends at a 1x1x1 level, or after the first level when mipmaps were not requested.
622 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
628 texture->mipmaps = mipmaps;
629 texture->size = size;
631 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here; a NULL bytes pointer makes the
// slot look unused to DPSOFTRAST_Texture_GetByIndex.
632 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of a texture and returns its slot to the free range.
// Does nothing when index does not refer to a live texture; that covers an already-freed slot,
// because the lookup requires non-NULL bytes.
636 void DPSOFTRAST_Texture_Free(int index)
638 DPSOFTRAST_Texture *texture;
639 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
643 MM_FREE(texture->bytes);
// The memset clears the whole entry, including the bytes pointer assigned just before it.
644 texture->bytes = NULL;
645 memset(texture, 0, sizeof(*texture));
646 // adjust the free range and used range
647 if (dpsoftrast.texture_firstfree > index)
648 dpsoftrast.texture_firstfree = index;
// Shrink the used range past any trailing unused slots.
649 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650 dpsoftrast.texture_end--;
// Regenerates mip levels 1 and above of a texture by averaging texels of the level below.
// Does nothing for an invalid index or a texture with at most one level.
// Texels are 4 bytes; each channel is averaged separately with rounding.
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
654 int i, x, y, z, w, layer0, layer1, row0, row1;
655 unsigned char *o, *i0, *i1, *i2, *i3;
656 DPSOFTRAST_Texture *texture;
657 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658 if (texture->mipmaps <= 1)
// Level i is built from level i-1. mipmap[][2..4] are width, height, depth; mipmap[][0] is the byte offset.
660 for (i = 1;i < texture->mipmaps;i++)
662 for (z = 0;z < texture->mipmap[i][4];z++)
// layer0 / layer1 and row0 / row1 select the source layers and rows (their assignments are not
// visible in this excerpt); the second of each pair is clamped to the last valid index of the parent.
666 if (layer1 >= texture->mipmap[i-1][4])
667 layer1 = texture->mipmap[i-1][4]-1;
668 for (y = 0;y < texture->mipmap[i][3];y++)
672 if (row1 >= texture->mipmap[i-1][3])
673 row1 = texture->mipmap[i-1][3]-1;
// o is the destination row; i0..i3 are the four source rows (layer0/row0, layer0/row1, layer1/row0, layer1/row1).
674 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
675 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679 w = texture->mipmap[i][2];
// NOTE(review): the condition choosing between the 3D and 2D averaging paths is not visible here.
682 if (texture->mipmap[i-1][2] > 1)
684 // average 3D texture
// Eight source texels per output texel: two neighbours in x from each of the four rows.
685 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
687 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
695 // average 3D mipmap with parent width == 1
// NOTE(review): the loop advances only i0 and i1 although i2 and i3 are read too; this is harmless
// only if w is 1 in this branch -- confirm.
696 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
698 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
707 if (texture->mipmap[i-1][2] > 1)
709 // average 2D texture (common case)
// Four source texels per output texel: two neighbours in x from each of the two rows.
710 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
712 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
720 // 2D texture with parent width == 1
721 o[0] = (i0[0] + i1[0] + 1) >> 1;
722 o[1] = (i0[1] + i1[1] + 1) >> 1;
723 o[2] = (i0[2] + i1[2] + 1) >> 1;
724 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangle of 4-byte texels into a texture and regenerates its mipmaps.
//   pixels                   tightly packed source rows, blockwidth texels each
//   blockx, blocky           destination position in texels
//   blockwidth, blockheight  rectangle size in texels
// Does nothing for an invalid index.
// NOTE(review): the visible lines address level 0 only (mipmap[0]) and show no use of mip and no
// bounds check of the rectangle; several lines of this function are not visible -- confirm.
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
733 DPSOFTRAST_Texture *texture;
735 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// First destination texel: row blocky, column blockx of level 0.
740 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741 while (blockheight > 0)
// One row per iteration: the source advances by the block width, the destination by the texture width.
743 memcpy(dst, pixels, blockwidth * 4);
744 pixels += blockwidth * 4;
745 dst += texture->mipmap[0][2] * 4;
749 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of level 0 with the data at pixels and regenerates the mipmaps.
// pixels must provide mipmap[0][1] bytes (the byte size of level 0). Does nothing for an invalid index.
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
758 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width in texels of mip level mip, or 0 when index is not a live texture.
// NOTE(review): mip is used as an array index without a range check.
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
763 DPSOFTRAST_Texture *texture;
764 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765 return texture->mipmap[mip][2];
// Returns the height in texels of mip level mip, or 0 when index is not a live texture.
// NOTE(review): mip is used as an array index without a range check.
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
769 DPSOFTRAST_Texture *texture;
770 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771 return texture->mipmap[mip][3];
// Returns the depth in texels of mip level mip, or 0 when index is not a live texture.
// NOTE(review): mip is used as an array index without a range check.
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
775 DPSOFTRAST_Texture *texture;
776 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's pixel storage,
// or a null pointer when index is not a live texture.
// NOTE(review): mip is used as an array index without a range check in the visible lines; two
// lines of this function are not visible in this excerpt.
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
781 DPSOFTRAST_Texture *texture;
782 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Does nothing for an invalid index.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected, with a message in
// dpsoftrast.errorstring, for textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
789 DPSOFTRAST_Texture *texture;
790 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
793 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
798 texture->filter = filter;
// Defined later in the file; needed by DPSOFTRAST_AllocateCommand.
801 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the written commands: copies commandpool.freecommand into
// dpsoftrast.drawcommand. When threads are in use a memory barrier is issued first, so the
// command data is stored before the new position.
803 static void DPSOFTRAST_Draw_SyncCommands(void)
805 if(dpsoftrast.usethreads) MEMORY_BARRIER;
806 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for space bytes in the command pool, waiting for threads when necessary.
// Several control-flow lines of this function are not visible in this excerpt; the comments
// describe the visible statements only.
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
811 DPSOFTRAST_State_Thread *thread;
813 int freecommand = dpsoftrast.commandpool.freecommand;
814 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Check whether the recorded usage already leaves enough room.
815 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// Publish everything written so far, so that threads can work on it.
817 DPSOFTRAST_Draw_SyncCommands();
// Measure, per thread, the distance from its commandoffset to the free position (wrapping around
// the pool) and keep the largest distance as the new usage figure.
823 for (i = 0; i < dpsoftrast.numthreads; i++)
825 thread = &dpsoftrast.threads[i];
826 commandoffset = freecommand - thread->commandoffset;
827 if (commandoffset < 0)
828 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829 if (commandoffset > usedcommands)
832 usedcommands = commandoffset;
// Enough room now, or no thread selected to wait for.
835 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// Wait for the selected thread unless it has already reached the published position.
837 thread = &dpsoftrast.threads[waitindex];
838 Thread_LockMutex(thread->drawmutex);
839 if (thread->commandoffset != dpsoftrast.drawcommand)
// Mark that the producer is waiting on this thread, signal the thread if it is flagged as
// starving, then sleep on its wait condition.
841 thread->waiting = true;
842 if (thread->starving) Thread_CondSignal(thread->drawcond);
843 Thread_CondWait(thread->waitcond, thread->drawmutex);
844 thread->waiting = false;
846 Thread_UnlockMutex(thread->drawmutex);
// Store the recomputed usage.
848 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (the masking requires a power of two).
851 #define DPSOFTRAST_ALIGNCOMMAND(size) \
852 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>
// and yields a pointer of that type; the size passed on is the aligned struct size.
853 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
854 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes in the command ring buffer and writes the command header.
//   opcode  stored in the header's opcode field
//   size    command size in bytes; also stored in the header
// Several lines of this function (including the return) are not visible in this excerpt.
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
858 DPSOFTRAST_Command *command;
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
// Room required beyond the command itself: one command header, plus the unusable tail of the
// pool when the command does not fit before the end.
861 int extra = sizeof(DPSOFTRAST_Command);
862 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Not enough room: reclaim space (DPSOFTRAST_Draw_FreeCommandPool / DPSOFTRAST_Draw_FlushThreads),
// then re-read the pool positions, which those calls may change.
864 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
866 if (dpsoftrast.usethreads)
867 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
869 DPSOFTRAST_Draw_FlushThreads();
870 freecommand = dpsoftrast.commandpool.freecommand;
871 usedcommands = dpsoftrast.commandpool.usedcommands;
// The command does not fit before the end of the pool: write a Reset command at the current
// position and count the skipped tail as used.
873 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
875 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876 command->opcode = DPSOFTRAST_OPCODE_Reset;
877 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// Write the header of the new command.
880 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881 command->opcode = opcode;
882 command->commandsize = size;
// Store the updated positions.
884 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
886 dpsoftrast.commandpool.freecommand = freecommand;
887 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back an allocation of size bytes from the command pool by adjusting the free position
// and the usage figure.
// NOTE(review): the statements that move freecommand back and decide when to wrap are not visible
// in this excerpt; only the wrap-around addition is.
891 static void DPSOFTRAST_UndoCommand(int size)
893 int freecommand = dpsoftrast.commandpool.freecommand;
894 int usedcommands = dpsoftrast.commandpool.usedcommands;
897 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898 usedcommands -= size;
899 dpsoftrast.commandpool.freecommand = freecommand;
900 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): carries the rectangle x, y, width, height.
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle and marks the framebuffer-derived state as stale.
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
906 thread->viewport[0] = command->x;
907 thread->viewport[1] = command->y;
908 thread->viewport[2] = command->width;
909 thread->viewport[3] = command->height;
910 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Viewport command, records the rectangle in the global state as well and
// recomputes the global viewport transform immediately.
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
914 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
917 command->width = width;
918 command->height = height;
920 dpsoftrast.viewport[0] = x;
921 dpsoftrast.viewport[1] = y;
922 dpsoftrast.viewport[2] = width;
923 dpsoftrast.viewport[3] = height;
924 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): carries the clear color as four floats.
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills this thread's rows of the scissor rectangle in every color buffer with the
// packed clear color.
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
930 int i, x1, y1, x2, y2, w, h, x, y;
931 int miny1, maxy1, miny2, maxy2;
// Make sure fb_scissor and the thread's row ranges are up to date.
935 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936 miny1 = thread->miny1;
937 maxy1 = thread->maxy1;
938 miny2 = thread->miny2;
939 maxy2 = thread->maxy2;
// Scissor rectangle corners, with the row range clipped to what this thread owns.
940 x1 = thread->fb_scissor[0];
941 y1 = thread->fb_scissor[1];
942 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
944 if (y1 < miny1) y1 = miny1;
945 if (y2 > maxy2) y2 = maxy2;
950 // FIXME: honor fb_colormask?
951 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// Up to four color buffers; entries that are NULL are tested for here.
952 for (i = 0;i < 4;i++)
954 if (!dpsoftrast.fb_colorpixels[i])
// Two passes over the rows: first up to maxy1, then from miny2 up to maxy2.
956 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
959 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command (the statements filling in its fields are not visible in this excerpt).
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
967 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): carries the clear depth as a float.
974 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills this thread's rows of the scissor rectangle in the depth buffer with the
// converted depth value.
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
977 int x1, y1, x2, y2, w, h, x, y;
978 int miny1, maxy1, miny2, maxy2;
// Make sure fb_scissor and the thread's row ranges are up to date.
982 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983 miny1 = thread->miny1;
984 maxy1 = thread->maxy1;
985 miny2 = thread->miny2;
986 maxy2 = thread->maxy2;
// Scissor rectangle corners, with the row range clipped to what this thread owns.
987 x1 = thread->fb_scissor[0];
988 y1 = thread->fb_scissor[1];
989 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
991 if (y1 < miny1) y1 = miny1;
992 if (y2 > maxy2) y2 = maxy2;
// Convert the float depth to the integer depth buffer format.
997 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// Two passes over the rows: first up to maxy1, then from miny2 up to maxy2.
998 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1001 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command (the statement storing d is not visible in this excerpt).
1006 void DPSOFTRAST_ClearDepth(float d)
1008 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one enable value per channel.
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: stores each channel flag as 0 or 1 and builds the matching 32-bit pixel mask.
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1015 thread->colormask[0] = command->r != 0;
1016 thread->colormask[1] = command->g != 0;
1017 thread->colormask[2] = command->b != 0;
1018 thread->colormask[3] = command->a != 0;
// Negating a 0/1 flag gives 0 or all ones, which selects the channel's byte:
// red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000.
1019 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command (the statements filling in its fields are not visible in this excerpt).
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1023 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enables or disables depth testing.
1030 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function as stale, because
// DPSOFTRAST_RecalcDepthFunc depends on it.
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1033 thread->depthtest = command->enable;
1034 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues a DepthTest command carrying enable.
1036 void DPSOFTRAST_DepthTest(int enable)
1038 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039 command->enable = enable;
// ScissorTest command (opcode 6): enables or disables the scissor rectangle.
1042 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived state as stale, because
// DPSOFTRAST_RecalcFB depends on it.
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1045 thread->scissortest = command->enable;
1046 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a ScissorTest command carrying enable.
1048 void DPSOFTRAST_ScissorTest(int enable)
1050 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051 command->enable = enable;
// Scissor command (opcode 7): carries the scissor rectangle x, y, width, height as floats.
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle and marks the framebuffer-derived state as stale.
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1057 thread->scissor[0] = command->x;
1058 thread->scissor[1] = command->y;
1059 thread->scissor[2] = command->width;
1060 thread->scissor[3] = command->height;
1061 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Scissor command (the statements storing x and y are not visible in this excerpt).
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1065 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1068 command->width = width;
1069 command->height = height;
// BlendFunc command (opcode 8): carries the source and destination blend factors.
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores both factors and marks the derived blend mode as stale.
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1075 thread->blendfunc[0] = command->sfactor;
1076 thread->blendfunc[1] = command->dfactor;
1077 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and stores both factors in it.
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1081 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082 command->sfactor = sfactor;
1083 command->dfactor = dfactor;
// BlendSubtract command: payload is a single int, `enable`.
// Interpreter: copies the flag into thread->blendsubtract and sets
// DPSOFTRAST_VALIDATE_BLENDFUNC in thread->validate.
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1089 thread->blendsubtract = command->enable;
1090 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and stores `enable` in it.
1092 void DPSOFTRAST_BlendSubtract(int enable)
1094 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095 command->enable = enable;
// DepthMask command: payload is a single int, `enable`.
// Interpreter: copies the flag into thread->depthmask (no validate bit is set here).
1098 DEFCOMMAND(10, DepthMask, int enable;)
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1101 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and stores `enable` in it.
1103 void DPSOFTRAST_DepthMask(int enable)
1105 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106 command->enable = enable;
// DepthFunc command: payload is a single int, `func`.
// Interpreter: copies it into thread->depthfunc.
1109 DEFCOMMAND(11, DepthFunc, int func;)
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1112 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and stores `func` in it.
1114 void DPSOFTRAST_DepthFunc(int func)
1116 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117 command->func = func;
// DepthRange command: payload is two floats (nearval, farval).
// Interpreter: stores them in thread->depthrange[0] and [1].
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1123 thread->depthrange[0] = command->nearval;
1124 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and stores both values in it.
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1128 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129 command->nearval = nearval;
1130 command->farval = farval;
// PolygonOffset command: payload is two floats (alongnormal, intoview).
// Interpreter: stores them in thread->polygonoffset[0] and [1].
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1136 thread->polygonoffset[0] = command->alongnormal;
1137 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and stores both values in it.
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1141 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142 command->alongnormal = alongnormal;
1143 command->intoview = intoview;
// CullFace command: payload is a single int, `mode`.
// Interpreter: copies it into thread->cullface.
1146 DEFCOMMAND(14, CullFace, int mode;)
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1149 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and stores `mode` in it.
1151 void DPSOFTRAST_CullFace(int mode)
1153 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154 command->mode = mode;
// AlphaTest command: payload is a single int, `enable`.
// Interpreter: copies the flag into thread->alphatest.
1157 DEFCOMMAND(15, AlphaTest, int enable;)
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1160 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and stores `enable` in it.
1162 void DPSOFTRAST_AlphaTest(int enable)
1164 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165 command->enable = enable;
// AlphaFunc command: payload is an int `func` and a float `ref`.
// Interpreter: stores them in thread->alphafunc and thread->alphavalue.
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1171 thread->alphafunc = command->func;
1172 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and stores `func` in it.
// NOTE(review): the store of `ref` is not visible here; presumably it is written
// to command->ref (the interpreter reads it) - confirm.
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1176 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177 command->func = func;
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1183 dpsoftrast.color[0] = r;
1184 dpsoftrast.color[1] = g;
1185 dpsoftrast.color[2] = b;
1186 dpsoftrast.color[3] = a;
// Reads a rectangle of pixels from color buffer 0 (dpsoftrast.fb_colorpixels[0])
// into `outpixels`, 4 bytes per pixel.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the rectangle; output row stride is blockwidth*4
// The rectangle is clamped to the framebuffer. Source rows are addressed as
// (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped relative
// to the output rows. A separate loop is taken when dpsoftrast.bigendian is set.
// NOTE(review): the per-pixel copy statements are not visible here.
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1191 int outstride = blockwidth * 4;
1192 int instride = dpsoftrast.fb_width * 4;
1195 int bx2 = blockx + blockwidth;
1196 int by2 = blocky + blockheight;
1200 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer bounds
1204 if (bx1 < 0) bx1 = 0;
1205 if (by1 < 0) by1 = 0;
1206 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1209 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1210 if (dpsoftrast.bigendian)
1212 for (y = by1;y < by2;y++)
1214 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215 o = (unsigned char *)outpixels + (y - by1) * outstride;
1216 for (x = bx1;x < bx2;x++)
1229 for (y = by1;y < by2;y++)
1231 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 into mip level `mip`
// of texture `index`.
//   tx, ty - destination origin inside the texture mip level
//   sx, sy - source origin inside the framebuffer
// Returns without copying if the texture lookup fails or `mip` is out of range.
// Both rectangles are clamped to their surfaces and the copy size is limited to
// the smaller of the two. Source rows are walked downward from
// (sheight - 1 - sy1), so the framebuffer is read vertically flipped.
// When the texture has more than one mip level, the mipmaps are recalculated.
// NOTE(review): several declarations and the tw/th/sw/sh computations are not
// visible here.
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1242 int tx2 = tx + width;
1243 int ty2 = ty + height;
1246 int sx2 = sx + width;
1247 int sy2 = sy + height;
1257 unsigned int *spixels;
1258 unsigned int *tpixels;
1259 DPSOFTRAST_Texture *texture;
1260 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261 if (mip < 0 || mip >= texture->mipmaps) return;
1263 spixels = dpsoftrast.fb_colorpixels[0];
1264 swidth = dpsoftrast.fb_width;
1265 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3] as width and height
1266 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267 twidth = texture->mipmap[mip][2];
1268 theight = texture->mipmap[mip][3];
1269 if (tx1 < 0) tx1 = 0;
1270 if (ty1 < 0) ty1 = 0;
1271 if (tx2 > twidth) tx2 = twidth;
1272 if (ty2 > theight) ty2 = theight;
1273 if (sx1 < 0) sx1 = 0;
1274 if (sy1 < 0) sy1 = 0;
1275 if (sx2 > swidth) sx2 = swidth;
1276 if (sy2 > sheight) sy2 = sheight;
1281 if (tw > sw) tw = sw;
1282 if (th > sh) th = sh;
1283 if (tw < 1 || th < 1)
1285 sy1 = sheight - 1 - sy1;
1286 for (y = 0;y < th;y++)
1287 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1288 if (texture->mipmaps > 1)
1289 DPSOFTRAST_Texture_CalculateMipmaps(index);
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1295 if (thread->texbound[command->unitnum])
1296 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297 thread->texbound[command->unitnum] = command->texture;
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1301 DPSOFTRAST_Command_SetTexture *command;
1302 DPSOFTRAST_Texture *texture;
1303 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1305 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1308 texture = DPSOFTRAST_Texture_GetByIndex(index);
1309 if (index && !texture)
1311 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1315 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316 command->unitnum = unitnum;
1317 command->texture = texture;
1319 dpsoftrast.texbound[unitnum] = texture;
1320 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array and its stride in the global state.
// The stride is later used as a byte step (see DPSOFTRAST_Array_Load).
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1325 dpsoftrast.pointer_vertex3f = vertex3f;
1326 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: records the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1330 dpsoftrast.pointer_color4f = color4f;
1331 dpsoftrast.pointer_color4ub = NULL;
1332 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: records the pointer and stride and
// clears the float color pointer so only one color source is active.
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1336 dpsoftrast.pointer_color4f = NULL;
1337 dpsoftrast.pointer_color4ub = color4ub;
1338 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for unit `unitnum`: pointer, number of
// float components per vertex, and stride.
// NOTE(review): `unitnum` is used as an array index without a range check in the
// visible code - callers presumably guarantee it is valid.
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1342 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command: payload is three ints (mode, permutation, exactspecularmath).
// Interpreter: copies them into the matching thread->shader_* fields.
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1350 thread->shader_mode = command->mode;
1351 thread->shader_permutation = command->permutation;
1352 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command and also mirrors the same three
// values in the global dpsoftrast.shader_* fields.
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1356 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357 command->mode = mode;
1358 command->permutation = permutation;
1359 command->exactspecularmath = exactspecularmath;
// keep the global copy in sync with what was queued
1361 dpsoftrast.shader_mode = mode;
1362 dpsoftrast.shader_permutation = permutation;
1363 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command: payload is the uniform index and four floats.
// Interpreter: copies the four floats into thread->uniform4f starting at
// element index*4 (each uniform occupies four consecutive floats).
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1369 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1373 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374 command->index = index;
1375 command->val[0] = v0;
1376 command->val[1] = v1;
1377 command->val[2] = v2;
1378 command->val[3] = v3;
1380 dpsoftrast.uniform4f[index*4+0] = v0;
1381 dpsoftrast.uniform4f[index*4+1] = v1;
1382 dpsoftrast.uniform4f[index*4+2] = v2;
1383 dpsoftrast.uniform4f[index*4+3] = v3;
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1387 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388 command->index = index;
1389 memcpy(command->val, v, sizeof(command->val));
1391 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command: payload is the uniform index and 16 floats declared
// with ALIGN (the public function fills them with aligned SSE stores).
// Interpreter: copies all 16 floats into thread->uniform4f starting at element
// index*4, i.e. a matrix occupies four consecutive 4-float uniform slots.
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1397 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform `uniform`.
// For each matrix a UniformMatrix4f command is queued; the uniform index
// advances by 4 and the source pointer by 16 floats per matrix.
// The four rows are loaded with unaligned or aligned SSE loads depending on
// whether `v` is ALIGN_SIZE-aligned, then stored both into the command and
// into the global dpsoftrast.uniform4f mirror.
// The unpack/move sequence below is a 4x4 transpose of m0..m3.
// NOTE(review): the condition guarding the transpose is not visible here;
// presumably it tests the `transpose` parameter - confirm.
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1403 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1405 __m128 m0, m1, m2, m3;
1406 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407 command->index = (DPSOFTRAST_UNIFORM)index;
1408 if (((size_t)v)&(ALIGN_SIZE-1))
1410 m0 = _mm_loadu_ps(v);
1411 m1 = _mm_loadu_ps(v+4);
1412 m2 = _mm_loadu_ps(v+8);
1413 m3 = _mm_loadu_ps(v+12);
1417 m0 = _mm_load_ps(v);
1418 m1 = _mm_load_ps(v+4);
1419 m2 = _mm_load_ps(v+8);
1420 m3 = _mm_load_ps(v+12);
1424 __m128 t0, t1, t2, t3;
1425 t0 = _mm_unpacklo_ps(m0, m1);
1426 t1 = _mm_unpacklo_ps(m2, m3);
1427 t2 = _mm_unpackhi_ps(m0, m1);
1428 t3 = _mm_unpackhi_ps(m2, m3);
1429 m0 = _mm_movelh_ps(t0, t1);
1430 m1 = _mm_movehl_ps(t1, t0);
1431 m2 = _mm_movelh_ps(t2, t3);
1432 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations must be 16-byte aligned
1434 _mm_store_ps(command->val, m0);
1435 _mm_store_ps(command->val+4, m1);
1436 _mm_store_ps(command->val+8, m2);
1437 _mm_store_ps(command->val+12, m3);
1438 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command: payload is the uniform index and one int.
// Interpreter: stores the value in thread->uniform1i[index].
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1449 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command for `index` and mirrors the
// value in the global dpsoftrast.uniform1i table.
// NOTE(review): the store of i0 into the command is not visible here; presumably
// it is written to command->val (the interpreter reads it) - confirm.
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1453 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454 command->index = index;
1457 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command: payload is four floats.
// Interpreter: copies them into thread->clipplane and sets
// DPSOFTRAST_VALIDATE_FB in thread->validate.
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1463 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1464 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ClipPlane command holding (x, y, z, w) in
// clipplane[0..3].
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1468 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469 command->clipplane[0] = x;
1470 command->clipplane[1] = y;
1471 command->clipplane[2] = z;
1472 command->clipplane[3] = w;
// Copies `size` 4-float elements from a strided byte source into a packed,
// 16-byte-aligned float destination (stores use _mm_store_ps).
// Unaligned loads are used when either the source address or the stride is
// not a multiple of ALIGN_SIZE; otherwise aligned loads are used.
// NOTE(review): the loop heads and pointer increments are not visible here.
1476 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1478 float *end = dst + size*4;
1479 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1483 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1492 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into packed
// 4-float elements with the fourth component set to 1.0f.
// Tightly packed input (stride == sizeof(float[3])) is handled four elements at
// a time: three 16-byte loads (12 floats) are reshuffled into four (x, y, z, 1)
// vectors; end4 marks where the groups of four end.
// The per-element path loads 16 bytes at each element, rotates the vector so
// the extra lane sits in lane 0, overwrites it with 1.0f and rotates back.
// Aligned or unaligned loads are chosen from the source address (and stride).
// NOTE(review): the per-element path reads 4 floats where the element has 3, so
// the load for the final element touches 4 bytes past it - confirm callers pad.
// NOTE(review): loop heads and dst advances are not visible here.
1499 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1501 float *end = dst + size*4;
1502 if (stride == sizeof(float[3]))
1504 float *end4 = dst + (size&~3)*4;
1505 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1509 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1510 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1511 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1517 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1518 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1521 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1523 src += 4*sizeof(float[3]);
1530 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1531 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1532 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1538 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1539 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1540 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1542 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1544 src += 4*sizeof(float[3]);
1548 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1552 __m128 v = _mm_loadu_ps((const float *)src);
1553 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556 _mm_store_ps(dst, v);
1565 __m128 v = _mm_load_ps((const float *)src);
1566 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1567 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1568 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1569 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into packed
// 4-float elements of the form (x, y, 0, 1); v2 supplies the constant upper half.
// Tightly packed input (stride == sizeof(float[2])) is handled two elements per
// 16-byte load; end2 marks where the pairs end. Other elements are loaded as
// 8 bytes with _mm_loadl_pi into the low half of v2.
// NOTE(review): loop heads and dst advances are not visible here.
1576 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1578 float *end = dst + size*4;
1579 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1580 if (stride == sizeof(float[2]))
1582 float *end2 = dst + (size&~1)*4;
1583 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1587 __m128 v = _mm_loadu_ps((const float *)src);
1588 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1591 src += 2*sizeof(float[2]);
1598 __m128 v = _mm_load_ps((const float *)src);
1599 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1600 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1602 src += 2*sizeof(float[2]);
1608 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided byte source into packed
// 4-float elements, scaling each byte by 1/255.
// Tightly packed input (stride == 4) is handled four elements per 16-byte load:
// bytes are zero-extended to 16 and then 32 bits by unpacking against zero,
// converted to float and scaled; end4 marks where the groups of four end.
// Other elements are read one 32-bit word at a time and widened the same way.
// NOTE(review): loop heads and dst advances are not visible here.
1614 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1616 float *end = dst + size*4;
1617 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1618 if (stride == sizeof(unsigned char[4]))
1620 float *end4 = dst + (size&~3)*4;
1621 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1625 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1631 src += 4*sizeof(unsigned char[4]);
1638 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1639 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1640 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1641 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1642 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1644 src += 4*sizeof(unsigned char[4]);
1650 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1651 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a packed 4-float array with one constant vector: the four floats at
// `src` are loaded once (unaligned load) and written to `dst` with aligned stores.
// `end` marks dst + 4*size.
// NOTE(review): the loop head is not visible here; presumably the store repeats
// until dst reaches end.
1657 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1659 float *end = dst + 4*size;
1660 __m128 v = _mm_loadu_ps(src);
1663 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix given as 16 floats.
// Each output is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3, where m0..m3 are the four
// consecutive 4-float groups of inmatrix16f.
// If the matrix compares bytewise equal to the identity, the input is simply
// copied (skipped when out4f == in4f).
// Aligned or unaligned loads are chosen from the alignment of in4f.
// NOTE(review): the loop heads and the store of each result are not visible here.
1669 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1672 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1673 __m128 m0, m1, m2, m3;
1675 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1677 // fast case for identity matrix
1678 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1681 end = out4f + numitems*4;
1682 m0 = _mm_loadu_ps(inmatrix16f);
1683 m1 = _mm_loadu_ps(inmatrix16f + 4);
1684 m2 = _mm_loadu_ps(inmatrix16f + 8);
1685 m3 = _mm_loadu_ps(inmatrix16f + 12);
1686 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1690 __m128 v = _mm_loadu_ps(in4f);
1692 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1693 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1694 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1695 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1704 __m128 v = _mm_load_ps(in4f);
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1708 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1709 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1717 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1719 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one clip-space vertex.
// The vector is rotated so w sits in lane 0, that lane is replaced by 1.0, and
// center + scale * p / w is computed; the result is then rotated back.
// Net effect: out.xyz = center + scale * in.xyz / in.w, and out.w = center + scale / in.w,
// with viewportcenter/viewportscale lane 0 pairing with the w slot and
// lanes 1..3 with x, y, z.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1725 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// The visible body is line-for-line the same as DPSOFTRAST_PROJECTVERTEX:
// rotate w into lane 0, replace it with 1.0, compute center + scale * p / w,
// rotate back.
// NOTE(review): no use of this macro appears in the visible code.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1733 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vector by a matrix given as four vectors m0..m3:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration of `p` is not visible here; presumably it is a
// local copy of `in` - confirm.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1742 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent of a bounding box given by its min and
// max corners (minposf/maxposf, 16-byte aligned) after transforming by inmatrix16f.
// The matrix vectors are shuffled so the first two lanes are swapped; the
// scalar (lane 0) min/max operations below therefore track the y component.
// BBFRONT transforms one corner, computes its clip distance as z + w, and for
// corners with clip distance >= 0 clears the corner's bit in clipmask and folds
// y/w into minproj/maxproj.
// BBCLIP handles corners still flagged in clipmask: for each of the three box
// edges (corner indices k^1, k^2, k^4) leading to an unflagged corner it
// interpolates to the point where the clip distance reaches zero and folds
// that point's y/w into minproj/maxproj.
// The extent is then clamped to [-2, 2], mapped through the viewport center and
// scale, and truncated to integers: *starty comes from maxproj and *endy from
// minproj + 1.
// NOTE(review): the return statement is not visible here; presumably clipmask is
// returned (callers store the result in dpsoftrast.drawclipped) - confirm.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1750 int clipmask = 0xFF;
1751 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1755 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1759 #define BBFRONT(k, pos) \
1761 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1766 clipmask &= ~(1<<k); \
1767 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1774 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1775 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1776 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1777 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1778 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1782 if (clipmask&(1<<k)) \
1784 if (!(clipmask&(1<<(k^1)))) \
1786 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789 minproj = _mm_min_ss(minproj, proj); \
1790 maxproj = _mm_max_ss(maxproj, proj); \
1792 if (!(clipmask&(1<<(k^2)))) \
1794 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797 minproj = _mm_min_ss(minproj, proj); \
1798 maxproj = _mm_max_ss(maxproj, proj); \
1800 if (!(clipmask&(1<<(k^4)))) \
1802 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805 minproj = _mm_min_ss(minproj, proj); \
1806 maxproj = _mm_max_ss(maxproj, proj); \
1810 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1811 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1813 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1817 *starty = _mm_cvttss_si32(maxproj);
1818 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed 4-float vertices to screen space.
// For each vertex: the componentwise min/max of the inputs is accumulated, the
// vertex is stored unchanged to out4f, and its projection
// (DPSOFTRAST_PROJECTVERTEX with the framebuffer viewport center/scale) is
// stored to screen4f. out4f and screen4f are written with aligned stores.
// Afterwards the min/max box is passed to DPSOFTRAST_Vertex_BoundY with an
// identity matrix to obtain *starty/*endy, and its result is returned.
// Aligned or unaligned loads are chosen from the alignment of in4f.
// NOTE(review): loop heads, pointer advances and any condition around the
// BoundY call are not visible here.
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1824 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825 float *end = out4f + numitems*4;
1826 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827 __m128 minpos, maxpos;
1828 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1830 minpos = maxpos = _mm_loadu_ps(in4f);
1833 __m128 v = _mm_loadu_ps(in4f);
1834 minpos = _mm_min_ps(minpos, v);
1835 maxpos = _mm_max_ps(maxpos, v);
1836 _mm_store_ps(out4f, v);
1837 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838 _mm_store_ps(screen4f, v);
1846 minpos = maxpos = _mm_load_ps(in4f);
1849 __m128 v = _mm_load_ps(in4f);
1850 minpos = _mm_min_ps(minpos, v);
1851 maxpos = _mm_max_ps(maxpos, v);
1852 _mm_store_ps(out4f, v);
1853 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854 _mm_store_ps(screen4f, v);
1862 ALIGN(float minposf[4]);
1863 ALIGN(float maxposf[4]);
1864 _mm_store_ps(minposf, minpos);
1865 _mm_store_ps(maxposf, maxpos);
1866 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` 4-float vertices by inmatrix16f and projects them to
// screen space in one pass.
// If the matrix compares bytewise equal to the identity, the work is delegated
// to DPSOFTRAST_Vertex_Project.
// Otherwise, for each vertex: the componentwise min/max of the untransformed
// inputs is accumulated, the transformed vertex is stored to out4f, and its
// projection is stored to screen4f (both with aligned stores).
// Afterwards the untransformed min/max box and the matrix are passed to
// DPSOFTRAST_Vertex_BoundY to obtain *starty/*endy, and its result is returned.
// NOTE(review): loop heads, pointer advances and any condition around the
// BoundY call are not visible here.
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1873 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1876 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878 end = out4f + numitems*4;
1879 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881 m0 = _mm_loadu_ps(inmatrix16f);
1882 m1 = _mm_loadu_ps(inmatrix16f + 4);
1883 m2 = _mm_loadu_ps(inmatrix16f + 8);
1884 m3 = _mm_loadu_ps(inmatrix16f + 12);
1885 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1887 minpos = maxpos = _mm_loadu_ps(in4f);
1890 __m128 v = _mm_loadu_ps(in4f);
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
1904 minpos = maxpos = _mm_load_ps(in4f);
1907 __m128 v = _mm_load_ps(in4f);
1908 minpos = _mm_min_ps(minpos, v);
1909 maxpos = _mm_max_ps(maxpos, v);
1910 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911 _mm_store_ps(out4f, v);
1912 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913 _mm_store_ps(screen4f, v);
1921 ALIGN(float minposf[4]);
1922 ALIGN(float maxposf[4]);
1923 _mm_store_ps(minposf, minpos);
1924 _mm_store_ps(maxposf, maxpos);
1925 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Expands one client vertex array into the packed 4-float working array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex (the source is offset by firstvertex * stride bytes).
//   POSITION  - 3-float positions, expanded with w = 1 (DPSOFTRAST_Load3fTo4f)
//   COLOR     - float colors if pointer_color4f is set, else byte colors if
//               pointer_color4ub is set, else a fill with dpsoftrast.color
//   otherwise - texture coordinates of unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0),
//               loaded according to their component count (2, 3 or 4 floats)
// NOTE(review): the switch heads, case labels for texcoords, the handling of a
// missing texcoord pointer and the return statement are not visible here.
1931 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1934 float *outf = dpsoftrast.post_array4f[outarray];
1935 const unsigned char *inb;
1936 int firstvertex = dpsoftrast.firstvertex;
1937 int numvertices = dpsoftrast.numvertices;
1941 case DPSOFTRAST_ARRAY_POSITION:
1942 stride = dpsoftrast.stride_vertex;
1943 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1944 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1946 case DPSOFTRAST_ARRAY_COLOR:
1947 stride = dpsoftrast.stride_color;
1948 if (dpsoftrast.pointer_color4f)
1950 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1951 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1953 else if (dpsoftrast.pointer_color4ub)
1955 stride = dpsoftrast.stride_color;
1956 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1957 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1965 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1966 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1969 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1972 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1975 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1978 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// If inarray >= 0 the client array is first loaded into post_array4f[outarray];
// otherwise the existing contents of post_array4f[outarray] are used.
// NOTE(review): the return statement is not visible here; presumably `data` is returned.
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1992 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it.
// If inarray >= 0 the client array is first loaded into post_array4f[outarray].
// Screen coordinates go to dpsoftrast.screencoord4f, the Y extent to
// dpsoftrast.drawstarty/drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data` is returned.
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
2001 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen space.
// If inarray >= 0 the client array is first loaded into post_array4f[outarray].
// Screen coordinates go to dpsoftrast.screencoord4f, the Y extent to
// dpsoftrast.drawstarty/drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible here; presumably `data` is returned.
2010 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2013 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2014 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span (pixels startx..endx-1).
// w is evaluated from the triangle's plane coefficients:
// w[2] + span->x * w[0] + span->y * w[1], with w[0] the slope per pixel in x.
// When the slope is zero (flat polygon) a simple per-pixel loop is used.
// Otherwise the span is processed in subspans of DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: an exact 1/w is computed at each subspan end and z is stepped
// linearly (dz) in between, avoiding a division per pixel.
// NOTE(review): the stores into `zf` and several declarations are not visible
// here; presumably each pixel's z is written to zf[x] - confirm.
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2024 int startx = span->startx;
2025 int endx = span->endx;
2026 float wslope = triangle->w[0];
2027 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028 float endz = 1.0f / (w + wslope * startx);
2029 if (triangle->w[0] == 0)
2031 // LordHavoc: fast flat polygons (HUD/menu)
2032 for (x = startx;x < endx;x++)
2036 for (x = startx;x < endx;)
2038 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is shortened so it ends on the final pixel of the span
2040 if (nextsub >= endx) nextsub = endsub = endx-1;
2041 endz = 1.0f / (w + wslope * nextsub);
2042 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043 for (; x <= endsub; x++, z += dz)
// Writes a shaded span of 4-byte pixels (in4ub) into color buffer 0
// (dpsoftrast.fb_colorpixels[0]), honouring span->pixelmask and the thread's
// blend mode (thread->fb_blendmode).
// x indexes are relative to span->x: the destination pointers are advanced to
// (span->x, span->y) once and then indexed with x in [startx, endx).
// Steps visible here:
//   1. alpha test: pixels with alpha byte < 128 are removed from the mask
//   2. for the alpha-weighted blend modes, pixels with alpha byte 0 are
//      removed from the mask
//   3. the mask is scanned for runs of enabled pixels [x, subx)
//   4. each run is copied (OPAQUE) or blended with SSE2 (other modes)
// NOTE(review): many short lines (braces, declarations, x/subx updates, the
// labels targeted by the gotos) are missing from this listing.
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2052 int startx = span->startx;
2053 int endx = span->endx;
2056 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057 unsigned char * RESTRICT pixelmask = span->pixelmask;
2058 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views of the framebuffer to the span origin
2062 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063 pixeli += span->y * dpsoftrast.fb_width + span->x;
2064 // handle alphatest now (this affects depth writes too)
2065 if (thread->alphatest)
2066 for (x = startx;x < endx;x++)
2067 if (in4ub[x*4+3] < 128)
2068 pixelmask[x] = false;
2069 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070 // helps sprites, text and hud artwork
2071 switch(thread->fb_blendmode)
2073 case DPSOFTRAST_BLENDMODE_ALPHA:
2074 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// alternate between runs with alpha >= 1 (left alone) and runs with
// alpha == 0 (mask cleared)
2077 for (x = startx;x < endx;x++)
2079 if (in4ub[x*4+3] >= 1)
2084 while (++x < endx && in4ub[x*4+3] >= 1) ;
2086 if (x >= endx) break;
2088 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089 if (x >= endx) break;
2096 case DPSOFTRAST_BLENDMODE_OPAQUE:
2097 case DPSOFTRAST_BLENDMODE_ADD:
2098 case DPSOFTRAST_BLENDMODE_INVMOD:
2099 case DPSOFTRAST_BLENDMODE_MUL:
2100 case DPSOFTRAST_BLENDMODE_MUL2:
2101 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102 case DPSOFTRAST_BLENDMODE_INVADD:
// NOTE(review): the two sentinel writes below touch pixelmask[endx] and
// pixelmask[endx+1]; this assumes the mask buffer has at least two bytes of
// slack past the span -- confirm where pixelmask is allocated.
2105 // put some special values at the end of the mask to ensure the loops end
2106 pixelmask[endx] = 1;
2107 pixelmask[endx+1] = 0;
2108 // LordHavoc: use a double loop to identify subspans, this helps the
2109 // optimized copy/blend loops to perform at their best, most triangles
2110 // have only one run of pixels, and do the search using wide reads...
2114 // if this pixel is masked off, it's probably not alone...
2121 // the 4-item search must be aligned or else it stalls badly
2122 if ((x & 3) && !pixelmask[x])
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
2132 if(pixelmask[x]) goto endmasked;
// skip four disabled mask bytes per iteration
// NOTE(review): reads the unsigned char mask through an unsigned int pointer
// (type punning); relies on the compiler tolerating this.
2137 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2141 for (;!pixelmask[x];x++)
2143 // rather than continue the loop, just check the end variable
2148 // find length of subspan
2151 if (subx + 8 < endx)
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
2163 if(!pixelmask[subx]) goto endunmasked;
// skip four enabled mask bytes (each equal to 1) per iteration
2168 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2172 for (;pixelmask[subx];subx++)
2174 // the checks can overshoot, so make sure to clip it...
2178 // now that we know the subspan length... process!
2179 switch(thread->fb_blendmode)
// OPAQUE: plain copy of the run, 16 / 4 / single pixels at a time
2181 case DPSOFTRAST_BLENDMODE_OPAQUE:
2185 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2190 while (x + 16 <= subx)
2192 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2200 while (x + 4 <= subx)
2202 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2208 pixeli[x+1] = ini[x+1];
// Blended modes. FINISHBLEND(blend2, blend1) widens source and destination
// bytes to 16-bit lanes, runs blend2 on two pixels at a time and blend1 on a
// trailing single pixel, then packs back to bytes with unsigned saturation.
// ALPHA: dst += (src - dst) * alpha / 256; the two 4-bit pre-shifts followed
// by the high-half multiply amount to a division by 256.
2218 case DPSOFTRAST_BLENDMODE_ALPHA:
2219 #define FINISHBLEND(blend2, blend1) \
2220 for (;x + 1 < subx;x += 2) \
2223 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2226 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2231 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2234 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2238 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += src * alpha / 256
2245 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2247 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst += src (saturated when packed back to bytes)
2254 case DPSOFTRAST_BLENDMODE_ADD:
2255 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= dst * src / 256
2257 case DPSOFTRAST_BLENDMODE_INVMOD:
2259 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = src * dst / 256
2264 case DPSOFTRAST_BLENDMODE_MUL:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = src * dst / 128 (twice the MUL result)
2267 case DPSOFTRAST_BLENDMODE_MUL2:
2268 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= src * alpha / 256
2270 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2272 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - dst * alpha / 256
2279 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2281 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2288 case DPSOFTRAST_BLENDMODE_INVADD:
2290 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to unit texunitindex for every pixel of the
// span and writes four floats per pixel to out4f.  Texture coordinates come
// from the interpolated attribute arrayindex, perspective-corrected with the
// per-pixel factors in zf.  Texel byte 2 is written to output component 0 and
// byte 0 to component 2 (first and third channels swapped); byte 3 is alpha.
// Four sampling loops are visible: bilinear + clamp, bilinear + wrap,
// nearest + clamp, nearest + wrap.
// NOTE(review): short lines (braces, several declarations, the tcimin
// initialisation, and the test that selects bilinear vs. nearest) are missing
// from this listing.
2300 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2303 int startx = span->startx;
2304 int endx = span->endx;
2309 float tc[2], endtc[2];
2311 unsigned int tci[2];
2312 unsigned int tci1[2];
2313 unsigned int tcimin[2];
2314 unsigned int tcimax[2];
2319 const unsigned char * RESTRICT pixelbase;
2320 const unsigned char * RESTRICT pixel[4];
2321 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2322 // if no texture is bound, just fill it with white
2325 for (x = startx;x < endx;x++)
2327 out4f[x*4+0] = 1.0f;
2328 out4f[x*4+1] = 1.0f;
2329 out4f[x*4+2] = 1.0f;
2330 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of this mip level inside
// texture->bytes; [2] and [3] are used below as its width and height
2334 mip = triangle->mip[texunitindex];
2335 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2336 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the colour below is read from texture->bytes (offset 0), not
// from pixelbase, whereas the BGRA8 variant of this function reads pixelbase;
// confirm this is intended for mip levels other than the first.
2337 if (texture->mipmap[mip][1] == 4)
2339 c[0] = texture->bytes[2] * (1.0f/255.0f);
2340 c[1] = texture->bytes[1] * (1.0f/255.0f);
2341 c[2] = texture->bytes[0] * (1.0f/255.0f);
2342 c[3] = texture->bytes[3] * (1.0f/255.0f);
2343 for (x = startx;x < endx;x++)
2345 out4f[x*4+0] = c[0];
2346 out4f[x*4+1] = c[1];
2347 out4f[x*4+2] = c[2];
2348 out4f[x*4+3] = c[3];
2352 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2353 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2354 flags = texture->flags;
// scale factors from normalised coordinates to texels, plus clamp limits and
// wrap masks (size - 1)
2355 tcscale[0] = texture->mipmap[mip][2];
2356 tcscale[1] = texture->mipmap[mip][3];
2357 tciwidth = texture->mipmap[mip][2];
2360 tcimax[0] = texture->mipmap[mip][2]-1;
2361 tcimax[1] = texture->mipmap[mip][3]-1;
2362 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2363 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2364 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2365 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// the exact coordinate is evaluated only at sub-span boundaries; inside a
// sub-span it is stepped linearly in 20.12 fixed point
2371 for (x = startx;x < endx;)
2373 unsigned int subtc[2];
2374 unsigned int substep[2];
2375 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2376 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2377 if (nextsub >= endx)
2379 nextsub = endsub = endx-1;
2380 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2384 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2385 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): these are float -> unsigned int conversions; if a coordinate
// or step can be negative the conversion is not defined by the C standard --
// confirm the value range or the intended wrap-around.
2391 substep[0] = (endtc[0] - tc[0]) * subscale;
2392 substep[1] = (endtc[1] - tc[1]) * subscale;
2393 subtc[0] = tc[0] * (1<<12);
2394 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp-to-edge: both texel columns/rows are clamped to
// [tcimin, tcimax] before addressing
2397 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2399 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and the four bilinear weights; the weights sum to
// 0x1000000, so weight * 255 peaks at 0xFF000000, the normaliser used below
2401 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2402 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2403 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2404 tci[0] = subtc[0]>>12;
2405 tci[1] = subtc[1]>>12;
2406 tci1[0] = tci[0] + 1;
2407 tci1[1] = tci[1] + 1;
2408 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2409 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2410 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2411 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2412 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2413 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2414 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2415 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2416 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2417 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2418 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2419 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2420 out4f[x*4+0] = c[0];
2421 out4f[x*4+1] = c[1];
2422 out4f[x*4+2] = c[2];
2423 out4f[x*4+3] = c[3];
// bilinear, wrapping: texel indices are masked with (size - 1), which wraps
// correctly only when the mip dimensions are powers of two
2428 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2430 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2431 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2432 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2433 tci[0] = subtc[0]>>12;
2434 tci[1] = subtc[1]>>12;
2435 tci1[0] = tci[0] + 1;
2436 tci1[1] = tci[1] + 1;
2437 tci[0] &= tciwrapmask[0];
2438 tci[1] &= tciwrapmask[1];
2439 tci1[0] &= tciwrapmask[0];
2440 tci1[1] &= tciwrapmask[1];
2441 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2442 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2443 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2444 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2445 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2446 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2447 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2448 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2449 out4f[x*4+0] = c[0];
2450 out4f[x*4+1] = c[1];
2451 out4f[x*4+2] = c[2];
2452 out4f[x*4+3] = c[3];
// single-texel (nearest), clamp-to-edge
2456 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2458 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2460 tci[0] = subtc[0]>>12;
2461 tci[1] = subtc[1]>>12;
2462 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2463 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2464 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2465 c[0] = pixel[0][2] * (1.0f / 255.0f);
2466 c[1] = pixel[0][1] * (1.0f / 255.0f);
2467 c[2] = pixel[0][0] * (1.0f / 255.0f);
2468 c[3] = pixel[0][3] * (1.0f / 255.0f);
2469 out4f[x*4+0] = c[0];
2470 out4f[x*4+1] = c[1];
2471 out4f[x*4+2] = c[2];
2472 out4f[x*4+3] = c[3];
// single-texel (nearest), wrapping
2477 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2479 tci[0] = subtc[0]>>12;
2480 tci[1] = subtc[1]>>12;
2481 tci[0] &= tciwrapmask[0];
2482 tci[1] &= tciwrapmask[1];
2483 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2484 c[0] = pixel[0][2] * (1.0f / 255.0f);
2485 c[1] = pixel[0][1] * (1.0f / 255.0f);
2486 c[2] = pixel[0][0] * (1.0f / 255.0f);
2487 c[3] = pixel[0][3] * (1.0f / 255.0f);
2488 out4f[x*4+0] = c[0];
2489 out4f[x*4+1] = c[1];
2490 out4f[x*4+2] = c[2];
2491 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture span sampler: writes one 4-byte texel-format
// pixel per span pixel to out4ub (no channel reordering is done here).
// Texture coordinates are carried in 16.16 fixed point; each __m128i holds
// the coordinates of two consecutive pixels, so the main loops advance two
// pixels per iteration and a separate tail handles the odd last pixel.
// Sampling paths visible below: bilinear fast path (whole sub-span inside the
// texture), bilinear clamp, bilinear wrap, nearest clamp, nearest wrap.
// NOTE(review): short lines (braces, some declarations, the tests that select
// bilinear vs. nearest and guard the tail pixel) are missing from this
// listing.
2497 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2501 int startx = span->startx;
2502 int endx = span->endx;
2504 __m128 data, slope, tcscale;
2505 __m128i tcsize, tcmask, tcoffset, tcmax;
2507 __m128i subtc, substep, endsubtc;
2510 int affine; // LordHavoc: optimized affine texturing case
2511 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2512 const unsigned char * RESTRICT pixelbase;
2513 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2514 // if no texture is bound, just fill it with white
2517 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2520 mip = triangle->mip[texunitindex];
2521 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2522 // if this mipmap of the texture is 1 pixel, just fill it with that color
2523 if (texture->mipmap[mip][1] == 4)
2525 unsigned int k = *((const unsigned int *)pixelbase);
2526 for (x = startx;x < endx;x++)
// equal perspective factors at both ends of the span: treat the whole span
// as one linearly interpolated sub-span
2530 affine = zf[startx] == zf[endx-1];
2531 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2532 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2533 flags = texture->flags;
// tcsize = (mipmap[2], mipmap[3]) repeated twice, i.e. the width/height pair
// for both pixels; tcmask = size - 1
2534 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2535 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2536 tcscale = _mm_cvtepi32_ps(tcsize);
2537 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2538 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2539 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift; presumably applied only when bilinear filtering is on
// (the guarding line is not visible) -- confirm
2541 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2542 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset packs two 16-bit factors: 4 (bytes per texel)
// in the low half and width*4 (bytes per row) in the high half, so that
// _mm_madd_epi16 on an (x, y) pair of 16-bit texel indices yields the byte
// offset x*4 + y*width*4
2543 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2544 tcmax = _mm_packs_epi32(tcmask, tcmask);
2545 for (x = startx;x < endx;)
2547 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2548 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2549 if (nextsub >= endx || affine)
2551 nextsub = endsub = endx-1;
2552 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2556 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2558 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2559 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2560 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = pixel x+1; the step is then
// doubled because every iteration advances two pixels
2561 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2562 substep = _mm_slli_epi32(substep, 1);
// bilinear fast path: if the integer coordinates at the start of the sub-span
// and just past its end are all >= 0 and < size-1, no clamping or wrapping is
// needed and two horizontally adjacent texels are fetched with one 8-byte load
2565 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2566 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2568 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2569 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2571 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2572 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2573 tci = _mm_madd_epi16(tci, tcoffset);
2574 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2575 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2576 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2577 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2578 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2579 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm: 16-bit lanes shifted right by one so the fractional parts are
// non-negative for the signed high-half multiply; together with the <<1 on
// the difference this computes diff * frac / 65536.  The vertical lerp is
// done first, then the horizontal one.
2580 fracm = _mm_srli_epi16(subtc, 1);
2581 pix1 = _mm_add_epi16(pix1,
2582 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2583 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2584 pix3 = _mm_add_epi16(pix3,
2585 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2586 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2587 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2588 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2589 pix2 = _mm_add_epi16(pix2,
2590 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2591 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2592 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// fast path, single trailing pixel
2596 const unsigned char * RESTRICT ptr1;
2597 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2598 tci = _mm_madd_epi16(tci, tcoffset);
2599 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2600 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2601 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2602 fracm = _mm_srli_epi16(subtc, 1);
2603 pix1 = _mm_add_epi16(pix1,
2604 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2605 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2606 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2607 pix1 = _mm_add_epi16(pix1,
2608 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2610 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, clamp-to-edge: the four neighbour coordinates (x,y), (x+1,y),
// (x,y+1), (x+1,y+1) are formed by adding the setr constant, clamped to
// [0, tcmax], and fetched individually
2614 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2616 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2618 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2619 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2620 tci = _mm_madd_epi16(tci, tcoffset);
2621 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2622 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2623 _mm_setzero_si128());
2624 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2625 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2626 _mm_setzero_si128());
2627 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is wrapped with _mm_and_si128
// here, while the first pixel (and the trailing single pixel below) of this
// clamp-to-edge branch use _mm_min_epi16/_mm_max_epi16; this looks like an
// inconsistency with the clamp semantics -- confirm and fix in the full file.
2628 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2629 tci = _mm_madd_epi16(tci, tcoffset);
2630 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632 _mm_setzero_si128());
2633 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635 _mm_setzero_si128());
2636 fracm = _mm_srli_epi16(subtc, 1);
2637 pix1 = _mm_add_epi16(pix1,
2638 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2640 pix3 = _mm_add_epi16(pix3,
2641 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2642 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2643 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2644 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2645 pix2 = _mm_add_epi16(pix2,
2646 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2647 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2648 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// bilinear clamp, single trailing pixel
2652 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2653 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2654 tci = _mm_madd_epi16(tci, tcoffset);
2655 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2656 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2657 _mm_setzero_si128());
2658 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2659 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2660 _mm_setzero_si128());
2661 fracm = _mm_srli_epi16(subtc, 1);
2662 pix1 = _mm_add_epi16(pix1,
2663 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2664 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2665 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2666 pix1 = _mm_add_epi16(pix1,
2667 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2669 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear, wrapping: neighbour coordinates are masked with tcmax (size - 1)
2675 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2677 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2678 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2679 tci = _mm_madd_epi16(tci, tcoffset);
2680 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2681 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2682 _mm_setzero_si128());
2683 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2684 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2685 _mm_setzero_si128());
2686 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2687 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2688 tci = _mm_madd_epi16(tci, tcoffset);
2689 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2690 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2691 _mm_setzero_si128());
2692 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2693 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2694 _mm_setzero_si128());
2695 fracm = _mm_srli_epi16(subtc, 1);
2696 pix1 = _mm_add_epi16(pix1,
2697 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2699 pix3 = _mm_add_epi16(pix3,
2700 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2701 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2702 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2703 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2704 pix2 = _mm_add_epi16(pix2,
2705 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2706 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2707 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// bilinear wrap, single trailing pixel
2711 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2712 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2713 tci = _mm_madd_epi16(tci, tcoffset);
2714 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2715 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2716 _mm_setzero_si128());
2717 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2718 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2719 _mm_setzero_si128());
2720 fracm = _mm_srli_epi16(subtc, 1);
2721 pix1 = _mm_add_epi16(pix1,
2722 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2723 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2724 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2725 pix1 = _mm_add_epi16(pix1,
2726 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2727 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2728 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (nearest) sampling, clamp-to-edge
2735 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2737 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2739 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2740 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2741 tci = _mm_madd_epi16(tci, tcoffset);
2742 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2743 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2747 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2748 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2749 tci = _mm_madd_epi16(tci, tcoffset);
2750 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel (nearest) sampling, wrapping
2756 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2758 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2759 tci = _mm_and_si128(tci, tcmax);
2760 tci = _mm_madd_epi16(tci, tcoffset);
2761 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2762 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2766 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2767 tci = _mm_and_si128(tci, tcmax);
2768 tci = _mm_madd_epi16(tci, tcoffset);
2769 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2778 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2781 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow-map lookup for the given vector, returning a float.
// NOTE(review): only the signature is visible in this listing; the body (and
// therefore the meaning of the returned value) must be checked in the full
// file.
2784 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float span by an interpolated vertex attribute:
// out4f = in4f * ((data + slope*x) * z), per component, for x in
// [startx, endx), four floats per pixel.  data/slope are produced for the
// attribute arrayindex by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the declarations of x, c, data, slope and z and the line that
// assigns z are not visible in this listing; z is presumably the per-pixel
// perspective factor read from zf[x] -- confirm.
2790 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2793 int startx = span->startx;
2794 int endx = span->endx;
2799 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2800 for (x = startx;x < endx;x++)
// attribute value at pixel x, scaled by z
2803 c[0] = (data[0] + slope[0]*x) * z;
2804 c[1] = (data[1] + slope[1]*x) * z;
2805 c[2] = (data[2] + slope[2]*x) * z;
2806 c[3] = (data[3] + slope[3]*x) * z;
// modulate the incoming colour
2807 out4f[x*4+0] = in4f[x*4+0] * c[0];
2808 out4f[x*4+1] = in4f[x*4+1] * c[1];
2809 out4f[x*4+2] = in4f[x*4+2] * c[2];
2810 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to a float span:
// out4f = (data + slope*x) * z, per component, for x in [startx, endx),
// four floats per pixel.  data/slope are produced for the attribute
// arrayindex by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the declarations of x, c, data, slope and z and the line that
// assigns z are not visible in this listing; z is presumably the per-pixel
// perspective factor read from zf[x] -- confirm.
2814 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2817 int startx = span->startx;
2818 int endx = span->endx;
2823 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2824 for (x = startx;x < endx;x++)
// attribute value at pixel x, scaled by z
2827 c[0] = (data[0] + slope[0]*x) * z;
2828 c[1] = (data[1] + slope[1]*x) * z;
2829 c[2] = (data[2] + slope[2]*x) * z;
2830 c[3] = (data[3] + slope[3]*x) * z;
2831 out4f[x*4+0] = c[0];
2832 out4f[x*4+1] = c[1];
2833 out4f[x*4+2] = c[2];
2834 out4f[x*4+3] = c[3];
2838 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2840 int x, startx = span->startx, endx = span->endx;
2841 float c[4], localcolor[4];
2842 localcolor[0] = subcolor[0];
2843 localcolor[1] = subcolor[1];
2844 localcolor[2] = subcolor[2];
2845 localcolor[3] = subcolor[3];
2846 for (x = startx;x < endx;x++)
2848 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2849 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2850 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2851 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2852 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2853 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2854 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2855 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2859 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2861 int x, startx = span->startx, endx = span->endx;
2862 for (x = startx;x < endx;x++)
2864 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2865 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2866 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2867 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2871 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2873 int x, startx = span->startx, endx = span->endx;
2874 for (x = startx;x < endx;x++)
2876 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2877 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2878 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2879 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans per pixel: out4f = ina4f * a + inb4f * b for all
// four components, for x in [startx, endx), where a is one minus the fourth
// (alpha) component of the inb4f pixel.  triangle is not used in the visible
// lines.
// NOTE(review): the declaration of a/b and the assignment of b are not
// visible in this listing; b is presumably the inb4f alpha itself
// (inb4f[x*4+3]) -- confirm against the full file.
2883 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2885 int x, startx = span->startx, endx = span->endx;
2887 for (x = startx;x < endx;x++)
// weight of the first buffer: inverse of the second buffer's alpha
2889 a = 1.0f - inb4f[x*4+3];
2891 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2892 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2893 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2894 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2898 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2900 int x, startx = span->startx, endx = span->endx;
2901 float localcolor[4], ilerp, lerp;
2902 localcolor[0] = color[0];
2903 localcolor[1] = color[1];
2904 localcolor[2] = color[2];
2905 localcolor[3] = color[3];
2906 ilerp = 1.0f - localcolor[3];
2907 lerp = localcolor[3];
2908 for (x = startx;x < endx;x++)
2910 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2911 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2912 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2913 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Scales a 4-byte-per-position input buffer (in4ub) channel by channel with a vertex attribute
// that varies along the span, writing the result to out4ub.
//   arrayindex - which attribute array DPSOFTRAST_CALCATTRIB evaluates (macro defined elsewhere)
//   zf         - one float per span position; the attribute at x is (data + slope*x) * zf[x]
// The attribute is evaluated in floating point only at sub-span boundaries (at most every
// DPSOFTRAST_DRAW_MAXSUBSPAN positions); in between it is stepped with 16-bit integer adds on
// values scaled by 256.
// NOTE(review): `mod` and `submod` are read below but no assignment to them is visible in this
// listing (presumably they take the previous endmod/endsubmod at the top of each sub-span) - confirm.
2919 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2923 int startx = span->startx;
2924 int endx = span->endx;
2927 __m128i submod, substep, endsubmod;
2928 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// exchange lanes 0 and 2 (lanes 1 and 3 stay) - presumably RGBA -> BGRA to match the byte order; confirm
2929 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2930 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first position, then the same value scaled by 256 as 32-bit integers
2931 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2932 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the loop body, one sub-span at a time
2933 for (x = startx; x < endx;)
2935 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2936 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last position and rescale the step to its length
2937 if (nextsub >= endx)
2939 nextsub = endsub = endx-1;
2940 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the sub-span end, the scaled step (endmod - mod) * subscale, and the scaled end value
2944 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2945 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2946 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to signed 16-bit with saturation: low four lanes = value for position x, high four = value + step (position x+1)
2947 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2948 substep = _mm_packs_epi32(substep, substep);
// two positions per iteration
// NOTE(review): x advances by 2 while substep is added once to both halves; if substep is a
// per-position step this interpolates at half rate within the sub-span - confirm intent.
2949 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes land in the high byte of each 16-bit lane (byte << 8), so the unsigned high-half
// multiply yields (byte * submod) >> 8; packus then saturates to 0..255
2951 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2952 pix = _mm_mulhi_epu16(pix, submod);
2953 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-position variant of the same computation (4 bytes in, 4 bytes out)
// NOTE(review): the condition guarding this tail and its x increment are not visible in this listing
2957 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2958 pix = _mm_mulhi_epu16(pix, submod);
2959 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes a vertex attribute that varies along the span to out4ub as 4 bytes per position.
//   arrayindex - which attribute array DPSOFTRAST_CALCATTRIB evaluates (macro defined elsewhere)
//   zf         - one float per span position; the attribute at x is (data + slope*x) * zf[x]
// Values are carried as integers scaled by 4095 and shifted right by 4 before being packed to
// bytes with unsigned saturation. As in the multiply variant, the attribute is evaluated in
// floating point only at sub-span boundaries (at most every DPSOFTRAST_DRAW_MAXSUBSPAN positions).
// NOTE(review): `mod` and `submod` are read below but no assignment to them is visible in this
// listing (presumably they take the previous endmod/endsubmod at the top of each sub-span) - confirm.
2966 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2970 int startx = span->startx;
2971 int endx = span->endx;
2974 __m128i submod, substep, endsubmod;
2975 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// exchange lanes 0 and 2 (lanes 1 and 3 stay) - presumably RGBA -> BGRA to match the byte order; confirm
2976 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2977 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first position, then the same value scaled by 4095 as 32-bit integers
2978 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2979 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the loop body, one sub-span at a time
2980 for (x = startx; x < endx;)
2982 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2983 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last position and rescale the step to its length
2984 if (nextsub >= endx)
2986 nextsub = endsub = endx-1;
2987 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the sub-span end, the scaled step (endmod - mod) * subscale, and the scaled end value
2991 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2992 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2993 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to signed 16-bit with saturation: low four lanes = value for position x, high four = value + step (position x+1)
2994 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2995 substep = _mm_packs_epi32(substep, substep);
// two positions per iteration
// NOTE(review): x advances by 2 while substep is added once to both halves; if substep is a
// per-position step this interpolates at half rate within the sub-span - confirm intent.
2996 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift right by 4 (4095-scaled value -> roughly 0..255), then saturate to bytes
2998 __m128i pix = _mm_srai_epi16(submod, 4);
2999 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail and its x increment are not visible in this listing
3003 __m128i pix = _mm_srai_epi16(submod, 4);
3004 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Adds a thresholded second buffer onto a first, 4 bytes per span position:
//   out = saturate_to_byte(ina + max(inb - subcolor*255, 0))   per channel
// for positions from span->startx up to span->endx.
//   subcolor - 4 floats, read with an unaligned load; lanes 0 and 2 are exchanged before use
//              (presumably RGBA -> BGRA - confirm)
// triangle is not referenced here.
3011 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3014 int x, startx = span->startx, endx = span->endx;
// subcolor scaled by 255, converted to integers, lanes 0/2 exchanged, then packed to 16-bit so
// the same four values sit in both halves of the register (one copy per position of a pair)
3015 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3016 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two positions (8 bytes) per iteration
3017 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes to 16-bit lanes
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero before the add
3021 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3026 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3028 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3029 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel product of two 4-byte-per-position buffers:
//   out = (ina * inb) >> 8
// for positions from span->startx up to span->endx. triangle is not referenced here.
3034 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3037 int x, startx = span->startx, endx = span->endx;
// two positions (8 bytes) per iteration
3038 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes zero-extended to 16 bits; pix2: inb bytes placed in the high byte (inb << 8)
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of ina * (inb << 8), i.e. (ina * inb) >> 8
3042 pix1 = _mm_mulhi_epu16(pix1, pix2);
3043 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3047 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3049 pix1 = _mm_mulhi_epu16(pix1, pix2);
3050 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel saturating sum of two 4-byte-per-position buffers:
//   out = min(ina + inb, 255)
// for positions from span->startx up to span->endx. triangle is not referenced here.
3055 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3058 int x, startx = span->startx, endx = span->endx;
// two positions (8 bytes) per iteration
3059 for (x = startx;x+2 <= endx;x+=2)
// zero-extend both inputs to 16-bit lanes so the add cannot wrap; packus saturates back to bytes
3061 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3062 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3063 pix1 = _mm_add_epi16(pix1, pix2);
3064 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3068 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3069 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3070 pix1 = _mm_add_epi16(pix1, pix2);
3071 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer onto a first, 4 bytes per span position:
//   out = saturate_to_byte(ina + ((tint * inb) >> 8)),  tint = inbtintbgra * 256 as integers
// for positions from span->startx up to span->endx.
//   inbtintbgra - 4 floats, read with an unaligned load and used in the given lane order
//                 (no lane exchange is applied here, unlike the other color parameters in this file)
// triangle is not referenced here.
3076 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3079 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256, packed to 16-bit so the same four values sit in both halves of the register
3080 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3081 tint = _mm_packs_epi32(tint, tint);
// two positions (8 bytes) per iteration
3082 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes zero-extended; pix2: inb bytes in the high byte (inb << 8)
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (inb << 8) is (tint * inb) >> 8; add to ina and saturate on pack
3086 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3087 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3091 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3093 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3094 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb over ina using inb's own 4th channel as the weight, 4 bytes per span position:
//   out = ina + (((inb - ina) * inb[3]) >> 8)   per channel, saturated to a byte on pack
// for positions from span->startx up to span->endx. triangle is not referenced here.
3099 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3102 int x, startx = span->startx, endx = span->endx;
// two positions (8 bytes) per iteration
3103 for (x = startx;x+2 <= endx;x+=2)
3105 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3106 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each position (low half and high half separately) to all four of its lanes
3107 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high-half multiply gives ((inb - ina) * blend) >> 8
3108 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3109 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-position variant: only the low four lanes hold data, so only the low shuffle is needed
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3113 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3114 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3115 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3116 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3117 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a 4-byte-per-position buffer toward one constant color, weighted by that color's 4th component:
//   out = in + (((c - in) * c[3]) >> 8),  c = color * 255 as integers   (saturated to a byte on pack)
// for positions from span->startx up to span->endx.
//   color - 4 floats, read with an unaligned load; lanes 0 and 2 are exchanged before use
//           (presumably RGBA -> BGRA - confirm)
// triangle is not referenced here.
3122 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3125 int x, startx = span->startx, endx = span->endx;
// color scaled by 255 with lanes 0/2 exchanged, packed to 16-bit (same four values in both halves)
3126 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3127 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: lane 3 of the color broadcast to every lane, pre-shifted left by 4
3128 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two positions (8 bytes) per iteration
3129 for (x = startx;x+2 <= endx;x+=2)
3131 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// difference shifted left by 4 times weight shifted left by 4, high half: ((c - in) * weight) >> 8
3132 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3133 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-position variant of the same computation
// NOTE(review): the condition guarding this tail (presumably x < endx) is not visible in this listing
3137 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3138 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3139 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3146 void DPSOFTRAST_VertexShader_Generic(void)
3148 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3149 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3150 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3151 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3152 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3155 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3157 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3158 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3159 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3160 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3161 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3162 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3164 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3165 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3166 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3168 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3169 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3172 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3174 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3177 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3179 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3182 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3187 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3188 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the post-process mode: transforms/projects the position array with the
// floats stored at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, then loads two texcoord arrays.
// The second load is called with (TEXCOORD1, TEXCOORD4) rather than the same array twice.
// NOTE(review): which argument of DPSOFTRAST_Array_Load is the destination is not visible here - confirm.
3193 void DPSOFTRAST_VertexShader_PostProcess(void)
3195 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3196 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3197 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3200 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3202 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3203 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3204 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3205 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3207 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3208 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3210 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3211 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3213 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3214 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3216 // TODO: implement saturation
3218 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3220 // TODO: implement gammaramps
3222 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the depth-or-shadow mode: only transforms/projects the position array with
// the floats stored at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1; no other arrays are touched.
3227 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3229 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3232 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3234 // this is never called (because colormask is off when this shader is used)
3235 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3238 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3239 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the flat-color mode: transforms/projects the position array with the floats
// at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 and transforms the first texcoord array with
// the floats at DPSOFTRAST_UNIFORM_TexMatrixM1.
3244 void DPSOFTRAST_VertexShader_FlatColor(void)
3246 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3247 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the flat-color mode: output = (color texture * Color_Ambient uniform) >> 8 per
// channel, with the 4th channel's factor taken from the Alpha uniform instead.
//
//   thread   - per-thread state (alphatest, fb_blendmode and uniform4f are read from it)
//   triangle - triangle being drawn
//   span     - span to shade (startx/endx, pixelmask, x/y are read from it)
//
// Results are written straight into the framebuffer row unless alpha test is on or the blend
// mode is not opaque; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of color/pix/pix2, the statements that skip the three extra
// positions after the 4-wide path, and the per-position pixelmask test are not visible in this
// listing - confirm against the full file.
3250 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3253 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of the destination: 4 bytes per pixel at (span->x, span->y) in the first color buffer
3254 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3255 int x, startx = span->startx, endx = span->endx;
3256 __m128i Color_Ambientm;
3257 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3258 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3259 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3260 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3261 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the result still needs alpha test / blending
3262 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3263 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 as integers with lanes 0 and 2 exchanged; lane 3 is cleared and replaced by
// (int)(Alpha * 255); finally packed to 16-bit so both register halves hold the same four values
3264 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3265 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3266 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3267 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3268 for (x = startx;x < endx;x++)
// fast path: four positions at once when at least four remain and all four mask bytes equal 1
3271 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3274 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each 16-bit lane, so mulhi gives (factor * texel) >> 8
3275 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3276 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3277 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-position path
3283 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3284 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3285 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish step
3287 if (pixel == buffer_FragColorbgra8)
3288 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the vertex-color mode: transforms/projects the position array with the
// floats at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, loads the color array, and
// transforms the first texcoord array with the floats at DPSOFTRAST_UNIFORM_TexMatrixM1.
3294 void DPSOFTRAST_VertexShader_VertexColor(void)
3296 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3297 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3298 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the vertex-color mode. Per channel:
//   output = texture * (Color_Ambient + Color_Diffuse * interpolated color attribute)
// computed in 16-bit fixed point. The 4th channel's diffuse factor is cleared and its ambient
// factor is replaced by the Alpha uniform, so that channel is the texture's scaled by Alpha only.
//
//   thread   - per-thread state (alphatest, fb_blendmode and uniform4f are read from it)
//   triangle - triangle being drawn
//   span     - span to shade (startx/endx, pixelmask, x/y are read from it)
//
// Results are written straight into the framebuffer row unless alpha test is on or the blend
// mode is not opaque; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of data/slope/pix2/mod2, the statements that skip the three
// extra positions after the 4-wide path, and the per-position pixelmask test are not visible in
// this listing - confirm against the full file.
3301 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3304 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of the destination: 4 bytes per pixel at (span->x, span->y) in the first color buffer
3305 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3306 int x, startx = span->startx, endx = span->endx;
3307 __m128i Color_Ambientm, Color_Diffusem;
3309 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3310 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3311 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3312 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3313 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3314 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the result still needs alpha test / blending
3315 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3316 pixel = buffer_FragColorbgra8;
// ambient: Color_Ambient * 256 with lanes 0 and 2 exchanged, lane 3 replaced by (int)(Alpha * 255), packed to 16-bit
3317 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3318 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3319 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3320 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: Color_Diffuse * 4096 with lanes 0 and 2 exchanged, lane 3 cleared, packed to 16-bit
3321 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3322 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3323 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// color attribute and its per-position slope (macro defined elsewhere), lanes 0 and 2 exchanged,
// advanced to the first position and pre-scaled by 4096
3324 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3325 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3326 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3327 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3328 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3329 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped once per loop iteration; the 4-wide path steps it three more times itself
3330 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3332 __m128i color, mod, pix;
// fast path: four positions at once when at least four remain and all four mask bytes equal 1
3333 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// z holds buffer_z for the four positions; each is broadcast and multiplied into that position's data
3336 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3337 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3338 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3339 data = _mm_add_ps(data, slope);
3340 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3341 data = _mm_add_ps(data, slope);
3342 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3343 data = _mm_add_ps(data, slope);
3344 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * attribute, high half) + ambient, then times the texel held in the high byte of each lane
3345 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3346 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3347 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3348 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3349 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-position path: same arithmetic on one texel
3355 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3356 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3357 mod = _mm_packs_epi32(mod, mod);
3358 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3359 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish step
3361 if (pixel == buffer_FragColorbgra8)
3362 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the lightmap mode: transforms/projects the position array with the floats
// at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, transforms the first texcoord array with
// the floats at DPSOFTRAST_UNIFORM_TexMatrixM1, and loads the TEXCOORD4 array (which the
// lightmap pixel shader uses to sample GL20TU_LIGHTMAP).
3368 void DPSOFTRAST_VertexShader_Lightmap(void)
3370 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3371 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3372 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the lightmap mode. Per channel, in 16-bit fixed point:
//   output = color texture * (Color_Ambient + Color_Diffuse * lightmap texture)
//            [+ Color_Glow * glow texture   when SHADERPERMUTATION_GLOW is set]
// The 4th channel's diffuse and glow factors are cleared and its ambient factor is replaced by
// the Alpha uniform.
//
//   thread   - per-thread state (shader_permutation, alphatest, fb_blendmode, uniform4f are read)
//   triangle - triangle being drawn
//   span     - span to shade (startx/endx, pixelmask, x/y are read from it)
//
// Results are written straight into the framebuffer row unless alpha test is on or the blend
// mode is not opaque; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): in this listing the glow loop and the plain loop appear back to back with no
// visible `else`; the pix2 declarations, the statements that skip the three extra positions
// after each 4-wide path, and the per-position pixelmask tests are also not visible - confirm
// against the full file.
3375 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3378 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of the destination: 4 bytes per pixel at (span->x, span->y) in the first color buffer
3379 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3380 int x, startx = span->startx, endx = span->endx;
3381 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3382 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3383 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3384 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3387 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses the TEXCOORD0 attribute, lightmap texture uses TEXCOORD4
3388 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3389 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the result still needs alpha test / blending
3390 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3391 pixel = buffer_FragColorbgra8;
// ambient: Color_Ambient * 256 with lanes 0 and 2 exchanged, lane 3 replaced by (int)(Alpha * 255), packed to 16-bit
3392 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3395 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: Color_Diffuse * 256 with lanes 0 and 2 exchanged, lane 3 cleared, packed to 16-bit
3396 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3397 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3398 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3399 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares the TEXCOORD0 attribute with the color texture
3401 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow: Color_Glow * 256 with lanes 0 and 2 exchanged, lane 3 cleared, packed to 16-bit
3402 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3403 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3404 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient factors, high 64 bits = glow factors (used by the single-position path)
3405 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3406 for (x = startx;x < endx;x++)
3408 __m128i color, lightmap, glow, pix;
// fast path: four positions at once when at least four remain and all four mask bytes equal 1
3409 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3412 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3413 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3414 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// positions 0-1: ((diffuse * lightmap, high half) + ambient) * color, high half, plus (glowfactor * glow, high half)
3415 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3416 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3417 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
// positions 2-3: same computation on the upper halves of the loaded texels
3418 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3419 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3420 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3421 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-position path: the lit color is computed in the low 64 bits and the glow term in the
// high 64 bits of one register (lightmap's high half is zero, so high half = glow factor * glow texel)
3427 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3428 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3429 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3430 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the high half (glow term) onto the low half (lit color)
3431 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3432 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop without the glow term
3437 for (x = startx;x < endx;x++)
3439 __m128i color, lightmap, pix;
// fast path: four positions at once when at least four remain and all four mask bytes equal 1
3440 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3443 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3444 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3445 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3446 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3447 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3448 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3449 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-position path
3455 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3456 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3457 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3458 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finish step
3461 if (pixel == buffer_FragColorbgra8)
3462 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders below call these two
// functions before their definitions appear later in the file.
3467 void DPSOFTRAST_VertexShader_LightDirection(void);
3468 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader for the fake-light mode: delegates entirely to DPSOFTRAST_VertexShader_LightDirection.
3470 void DPSOFTRAST_VertexShader_FakeLight(void)
3472 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader for the fake-light mode: forwards thread, triangle and span unchanged to
// DPSOFTRAST_PixelShader_LightDirection.
3475 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3477 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the model-space light-direction-map mode: runs
// DPSOFTRAST_VertexShader_LightDirection, then additionally loads the TEXCOORD4 array.
3482 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3484 DPSOFTRAST_VertexShader_LightDirection();
3485 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the model-space light-direction-map mode: forwards thread, triangle and span
// unchanged to DPSOFTRAST_PixelShader_LightDirection.
3488 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3490 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the tangent-space light-direction-map mode: runs
// DPSOFTRAST_VertexShader_LightDirection, then additionally loads the TEXCOORD4 array.
// As listed, the body is identical to the model-space variant.
3495 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3497 DPSOFTRAST_VertexShader_LightDirection();
3498 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the tangent-space light-direction-map mode: forwards thread, triangle and
// span unchanged to DPSOFTRAST_PixelShader_LightDirection.
3501 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3503 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3508 void DPSOFTRAST_VertexShader_LightDirection(void)
3511 int numvertices = dpsoftrast.numvertices;
3513 float LightVector[4];
3514 float EyePosition[4];
3515 float EyeVectorModelSpace[4];
3521 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3522 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3523 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3524 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3525 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3526 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3527 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3528 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3529 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3530 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3531 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3532 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3533 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3534 for (i = 0;i < numvertices;i++)
3536 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3537 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3538 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3539 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3540 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3541 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3542 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3543 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3544 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3545 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3546 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3547 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3548 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3549 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3550 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3551 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3552 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3553 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3554 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3555 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3556 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3557 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3558 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3559 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3560 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3561 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3562 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3563 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3564 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3566 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helpers used by the per-pixel lighting code.
// All of these are macros and may evaluate their arguments more than once,
// so do not pass expressions with side effects.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three components of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three components of v to unit length in place; a vector of
// length zero is left untouched.
// The parameter is parenthesized everywhere so that pointer expressions such as
// (p + 3) expand correctly, and the temporary has a prefixed name so it cannot
// capture an identifier that appears in the argument.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_v3n_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_v3n_len)\
	{\
		dpsoftrast_v3n_len = 1.0f / dpsoftrast_v3n_len;\
		(v)[0] *= dpsoftrast_v3n_len;\
		(v)[1] *= dpsoftrast_v3n_len;\
		(v)[2] *= dpsoftrast_v3n_len;\
	}\
}\
while(0)
3588 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3590 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3591 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3594 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3595 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3596 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3597 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3598 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3599 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3600 int x, startx = span->startx, endx = span->endx;
3601 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3602 float LightVectordata[4];
3603 float LightVectorslope[4];
3604 float EyeVectordata[4];
3605 float EyeVectorslope[4];
3606 float VectorSdata[4];
3607 float VectorSslope[4];
3608 float VectorTdata[4];
3609 float VectorTslope[4];
3610 float VectorRdata[4];
3611 float VectorRslope[4];
3613 float diffusetex[4];
3615 float surfacenormal[4];
3616 float lightnormal[4];
3617 float lightnormal_modelspace[4];
3619 float specularnormal[4];
3622 float SpecularPower;
3624 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3625 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3626 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3627 Color_Glow[3] = 0.0f;
3628 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3629 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3630 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3631 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3632 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3633 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3634 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3635 Color_Pants[3] = 0.0f;
3636 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3637 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3638 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3639 Color_Shirt[3] = 0.0f;
3640 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3641 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3642 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3644 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3645 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3647 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3649 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3653 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3654 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3655 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3656 Color_Diffuse[3] = 0.0f;
3657 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3658 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3659 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3660 LightColor[3] = 0.0f;
3661 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3662 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3663 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3664 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3665 Color_Specular[3] = 0.0f;
3666 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3667 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3668 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3670 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3672 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3673 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3674 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3675 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3676 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3678 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3680 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3681 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3683 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3685 // nothing of this needed
3689 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3692 for (x = startx;x < endx;x++)
3695 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3696 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3697 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3698 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3699 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3701 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3702 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3703 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3704 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3706 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3707 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3708 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3709 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3710 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3711 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3712 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3713 DPSOFTRAST_Vector3Normalize(surfacenormal);
3715 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3717 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3718 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3719 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3720 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3722 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3723 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3724 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3725 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3727 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3728 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3729 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3730 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3732 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3733 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3734 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3735 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3737 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3738 DPSOFTRAST_Vector3Normalize(lightnormal);
3740 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3742 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3743 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3744 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3745 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3748 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3750 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3751 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3752 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3754 float f = 1.0f / 256.0f;
3755 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3756 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3757 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3760 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3762 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3763 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3764 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3765 DPSOFTRAST_Vector3Normalize(lightnormal);
3767 LightColor[0] = 1.0;
3768 LightColor[1] = 1.0;
3769 LightColor[2] = 1.0;
3773 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3774 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3775 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3776 DPSOFTRAST_Vector3Normalize(lightnormal);
3779 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3781 if(thread->shader_exactspecularmath)
3783 // reflect lightnormal at surfacenormal, take the negative of that
3784 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3786 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3787 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3788 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3789 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3791 // dot of this and normalize(EyeVectorFogDepth.xyz)
3792 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3793 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3794 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3795 DPSOFTRAST_Vector3Normalize(eyenormal);
3797 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3801 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3802 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3803 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3804 DPSOFTRAST_Vector3Normalize(eyenormal);
3806 specularnormal[0] = lightnormal[0] + eyenormal[0];
3807 specularnormal[1] = lightnormal[1] + eyenormal[1];
3808 specularnormal[2] = lightnormal[2] + eyenormal[2];
3809 DPSOFTRAST_Vector3Normalize(specularnormal);
3811 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3814 specular = pow(specular, SpecularPower * glosstex[3]);
3815 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3817 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3818 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3819 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3820 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3824 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3825 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3826 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3827 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3830 buffer_FragColorbgra8[x*4+0] = d[0];
3831 buffer_FragColorbgra8[x*4+1] = d[1];
3832 buffer_FragColorbgra8[x*4+2] = d[2];
3833 buffer_FragColorbgra8[x*4+3] = d[3];
3836 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3838 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3839 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3840 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3841 Color_Diffuse[3] = 0.0f;
3842 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3843 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3844 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3845 LightColor[3] = 0.0f;
3846 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3848 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3850 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3851 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3852 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3853 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3854 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3856 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3858 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3859 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3861 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3863 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3867 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3870 for (x = startx;x < endx;x++)
3873 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3874 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3875 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3876 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3877 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3878 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3879 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3880 DPSOFTRAST_Vector3Normalize(surfacenormal);
3882 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3884 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3885 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3886 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3887 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3889 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3890 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3891 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3892 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3894 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3895 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3896 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3897 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3899 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3900 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3901 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3902 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3904 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3905 DPSOFTRAST_Vector3Normalize(lightnormal);
3907 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3909 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3910 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3911 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3912 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3915 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3917 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3918 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3919 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3921 float f = 1.0f / 256.0f;
3922 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3923 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3924 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3927 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3929 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3930 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3931 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3932 DPSOFTRAST_Vector3Normalize(lightnormal);
3934 LightColor[0] = 1.0;
3935 LightColor[1] = 1.0;
3936 LightColor[2] = 1.0;
3940 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3941 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3942 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3943 DPSOFTRAST_Vector3Normalize(lightnormal);
3946 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3947 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3949 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3950 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3951 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3952 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3956 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3957 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3958 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3959 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3961 buffer_FragColorbgra8[x*4+0] = d[0];
3962 buffer_FragColorbgra8[x*4+1] = d[1];
3963 buffer_FragColorbgra8[x*4+2] = d[2];
3964 buffer_FragColorbgra8[x*4+3] = d[3];
3969 for (x = startx;x < endx;x++)
3972 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3973 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3974 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3975 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3977 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3979 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3980 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3981 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3982 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3986 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3987 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3988 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3989 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3991 buffer_FragColorbgra8[x*4+0] = d[0];
3992 buffer_FragColorbgra8[x*4+1] = d[1];
3993 buffer_FragColorbgra8[x*4+2] = d[2];
3994 buffer_FragColorbgra8[x*4+3] = d[3];
3997 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4002 void DPSOFTRAST_VertexShader_LightSource(void)
4005 int numvertices = dpsoftrast.numvertices;
4006 float LightPosition[4];
4007 float LightVector[4];
4008 float LightVectorModelSpace[4];
4009 float EyePosition[4];
4010 float EyeVectorModelSpace[4];
4016 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4017 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4018 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4019 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4020 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4021 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4022 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4023 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4024 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4025 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4026 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4027 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4028 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4029 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4030 for (i = 0;i < numvertices;i++)
4032 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4033 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4034 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4035 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4036 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4037 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4038 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4039 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4040 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4041 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4042 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4043 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4044 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4045 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4046 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4047 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4048 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4049 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4050 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4051 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4052 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4053 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4054 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4055 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4056 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4057 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4058 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4059 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4060 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4061 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4062 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4063 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4065 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4066 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader for the light-source shader mode.
// Samples the color texture (plus pants/shirt, cube, normal and gloss textures
// depending on thread->shader_permutation) for the pixels startx..endx-1 of the
// span, computes attenuated per-pixel lighting, and passes the resulting BGRA8
// buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// One of three loops runs, selected by shader_permutation:
//   SHADERPERMUTATION_SPECULAR -> ambient + diffuse + specular
//   SHADERPERMUTATION_DIFFUSE  -> ambient + diffuse
//   otherwise                  -> ambient only
// NOTE(review): z, attenuation, diffuse, specular, f, d[], glosstex[] and
// eyenormal[] are used below but their declarations are not among the lines
// visible here - confirm against the full file.
4069 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4072 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4073 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4074 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4076 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4077 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4078 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4079 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4080 int x, startx = span->startx, endx = span->endx;
4081 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4082 float CubeVectordata[4];
4083 float CubeVectorslope[4];
4084 float LightVectordata[4];
4085 float LightVectorslope[4];
4086 float EyeVectordata[4];
4087 float EyeVectorslope[4];
4089 float diffusetex[4];
4091 float surfacenormal[4];
4092 float lightnormal[4];
4094 float specularnormal[4];
4097 float SpecularPower;
4098 float CubeVector[4];
// Fetch the color uniforms. Components 0,1,2 of each uniform are stored at
// indices 2,1,0, i.e. in reverse order - presumably so they line up with the
// channel order of the *bgra8 byte buffers (TODO confirm).
4101 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4102 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4103 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4104 Color_Glow[3] = 0.0f;
4105 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4106 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4107 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the Alpha uniform, not from Color_Ambient
4108 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4109 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4110 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4111 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4112 Color_Diffuse[3] = 0.0f;
4113 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4114 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4115 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4116 Color_Specular[3] = 0.0f;
4117 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4118 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4119 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4120 Color_Pants[3] = 0.0f;
4121 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4122 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4123 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4124 Color_Shirt[3] = 0.0f;
4125 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4126 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4127 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4128 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by glosstex[3], which is
// read as a raw byte value from the gloss texture buffer
4129 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Set up base/slope pairs for the interpolated vertex attributes; each is
// later evaluated per pixel as (data + slope*x) * z.
4130 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4131 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4132 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4134 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4135 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4136 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4138 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4139 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4141 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4142 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- variant 1: ambient + diffuse + specular ----
4143 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4145 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4146 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4147 for (x = startx;x < endx;x++)
// attenuation is 1 minus the squared length of the interpolated TEXCOORD3
// vector; the statement guarded by the threshold test is not visible here
// (presumably it skips the pixel, leaving the pre-cleared black - TODO confirm)
4150 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4151 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4152 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4153 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4154 if (attenuation < 0.01f)
4156 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4158 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4159 if (attenuation < 0.01f)
4163 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4164 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4165 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4166 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4167 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4169 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4170 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4171 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4172 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4174 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4175 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4176 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4177 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: bytes at offsets 2,1,0 become x,y,z, each
// mapped from a byte to roughly -1..1 via b/128 - 1, then normalized
4178 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4179 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4180 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4181 DPSOFTRAST_Vector3Normalize(surfacenormal);
4183 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4184 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4185 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4186 DPSOFTRAST_Vector3Normalize(lightnormal);
4188 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact path: specular term is dot(eye, reflected light direction)
4190 if(thread->shader_exactspecularmath)
4192 // reflect lightnormal at surfacenormal, take the negative of that
4193 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4195 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4196 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4197 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4198 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4200 // dot of this and normalize(EyeVectorFogDepth.xyz)
4201 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4202 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4203 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4204 DPSOFTRAST_Vector3Normalize(eyenormal);
4206 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// the lines below appear to be the alternative (non-exact) path: the specular
// term is dot(surface normal, normalized sum of light and eye directions)
4210 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4211 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4212 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4213 DPSOFTRAST_Vector3Normalize(eyenormal);
4215 specularnormal[0] = lightnormal[0] + eyenormal[0];
4216 specularnormal[1] = lightnormal[1] + eyenormal[1];
4217 specularnormal[2] = lightnormal[2] + eyenormal[2];
4218 DPSOFTRAST_Vector3Normalize(specularnormal);
4220 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform scaled by the gloss texture's 4th byte
4222 specular = pow(specular, SpecularPower * glosstex[3]);
// final color per channel; only the upper bound is clamped (to 255)
4224 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4226 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4227 attenuation *= (1.0f / 255.0f);
4228 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4229 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4230 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4231 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4235 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4236 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4237 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4238 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4240 buffer_FragColorbgra8[x*4+0] = d[0];
4241 buffer_FragColorbgra8[x*4+1] = d[1];
4242 buffer_FragColorbgra8[x*4+2] = d[2];
4243 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 2: ambient + diffuse (same steps as above without the gloss/specular term) ----
4246 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4248 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4249 for (x = startx;x < endx;x++)
4252 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4253 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4254 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4255 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4256 if (attenuation < 0.01f)
4258 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4260 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4261 if (attenuation < 0.01f)
4265 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4266 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4267 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4268 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4269 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4271 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4272 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4273 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4274 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4276 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4277 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4278 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4279 DPSOFTRAST_Vector3Normalize(surfacenormal);
4281 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4282 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4283 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4284 DPSOFTRAST_Vector3Normalize(lightnormal);
4286 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4287 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4289 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4290 attenuation *= (1.0f / 255.0f);
4291 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4292 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4293 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4294 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4298 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4299 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4300 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4301 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4303 buffer_FragColorbgra8[x*4+0] = d[0];
4304 buffer_FragColorbgra8[x*4+1] = d[1];
4305 buffer_FragColorbgra8[x*4+2] = d[2];
4306 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 3: ambient only (no normal map lookup, no diffuse/specular term) ----
4311 for (x = startx;x < endx;x++)
4314 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4315 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4316 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4317 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4318 if (attenuation < 0.01f)
4320 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4322 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4323 if (attenuation < 0.01f)
4327 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4328 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4329 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4330 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4331 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4333 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4334 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4335 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4336 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4338 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4340 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4341 attenuation *= (1.0f / 255.0f);
4342 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4343 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4344 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4345 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4349 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4350 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4351 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4352 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4354 buffer_FragColorbgra8[x*4+0] = d[0];
4355 buffer_FragColorbgra8[x*4+1] = d[1];
4356 buffer_FragColorbgra8[x*4+2] = d[2];
4357 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colors to the common span output routine
4360 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the refraction shader mode. Issues three array transforms
// (arguments presumably are: destination array, source array, matrix - TODO confirm):
//   TEXCOORD1 <- POSITION  with the ModelViewProjection matrix (Array_Transform)
//   TEXCOORD0 <- TEXCOORD0 with the texture matrix
//   POSITION  <- POSITION  with the ModelViewProjection matrix (Array_TransformProject)
// The TEXCOORD1 copy is what the pixel shader interpolates as
// ModelViewProjectionPosition.
4366 void DPSOFTRAST_VertexShader_Refraction(void)
4368 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4369 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4370 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the refraction shader mode.
// For each pixel of the span it derives a screen-space texture coordinate from
// the interpolated TEXCOORD1 position, offsets it by the x/y of the normal map
// texel scaled by DistortScaleRefractReflect, samples mip level 0 of the
// texture bound to GL20TU_REFRACTION (bilinear or nearest, clamped to the
// texture edges), multiplies by RefractColor and writes the BGRA8 result.
// Returns without drawing anything if no refraction texture is bound.
// NOTE(review): iw, v[] and c[] are used below but their declarations are not
// among the lines visible here - confirm against the full file.
4373 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4375 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4377 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4379 int x, startx = span->startx, endx = span->endx;
4382 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4383 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4386 float ModelViewProjectionPositiondata[4];
4387 float ModelViewProjectionPositionslope[4];
4390 float ScreenScaleRefractReflect[2];
4391 float ScreenCenterRefractReflect[2];
4392 float DistortScaleRefractReflect[2];
4393 float RefractColor[4];
4395 const unsigned char * RESTRICT pixelbase;
4396 const unsigned char * RESTRICT pixel[4];
4397 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4398 if(!texture) return;
// mipmap[0][0] is used as the byte offset of mip level 0 inside texture->bytes;
// mipmap[0][2] and mipmap[0][3] are used below as that level's width and height
4399 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4402 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4403 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4406 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4409 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4410 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4411 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4412 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4413 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4414 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// color components 0..2 of the uniform are stored reversed (index 2,1,0),
// matching the order in which c[0..2] are read from the texel bytes below
4415 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4416 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4417 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4418 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4421 for (x = startx;x < endx;x++)
4423 float SafeScreenTexCoord[2];
4424 float ScreenTexCoord[2];
4431 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4432 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4434 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4435 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4436 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4438 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4439 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4440 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4441 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4442 DPSOFTRAST_Vector3Normalize(v);
4443 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4444 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4446 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
// Bilinear path: texel coordinates in fixed point with 12 fractional bits,
// shifted back by 2048 (= half a texel). frac/ifrac are the 12-bit blend
// weights, lerp[] holds the four 24-bit corner weights (hence the >>24), and
// the integer coordinates are clamped to [0, size-1] on both axes.
// NOTE(review): tc[] is unsigned but is initialised from a float expression
// that is negative whenever ScreenTexCoord*size*4096 < 2048; converting a
// negative float to an unsigned type is undefined behaviour in C, and because
// tc[]>>12 is computed unsigned the "tci >= 0" tests below cannot catch
// coordinates left of / above the texture. Consider a signed type - confirm.
4447 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4449 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4450 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4451 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4452 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4453 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4454 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4455 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4456 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4457 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4458 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4459 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4460 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4461 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4462 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4463 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4464 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4465 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Nearest path (apparently the alternative to the linear filter branch):
// truncate to an integer texel and clamp to the texture bounds. The lines
// that copy pixel[0] into c[] are not visible here - TODO confirm.
4469 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4470 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4471 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4472 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4478 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// modulate the sampled color by RefractColor; the fourth byte comes from
// RefractColor[3] alone, scaled by 256 and capped at 255
4479 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4480 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4481 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4482 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4485 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the water shader mode: only runs the position array
// through DPSOFTRAST_Array_TransformProject with the ModelViewProjection
// matrix uniform (POSITION is both source and destination); no other vertex
// arrays are touched.
4490 void DPSOFTRAST_VertexShader_Water(void)
4492 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the water shader mode. No water shading is performed
// here: the span's color bytes (startx..endx-1, 4 bytes per pixel) are zeroed
// and handed to DPSOFTRAST_Draw_Span_FinishBGRA8, so this looks like a
// placeholder implementation.
4496 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4499 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4500 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4501 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span are cleared; the rest of the buffer stays uninitialized
4502 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4503 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the show-depth shader mode: only runs the position array
// through DPSOFTRAST_Array_TransformProject with the ModelViewProjection
// matrix uniform (POSITION is both source and destination).
4508 void DPSOFTRAST_VertexShader_ShowDepth(void)
4510 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the show-depth shader mode. Despite the name, no depth
// value is turned into a color here: the span's color bytes are zeroed and
// handed to DPSOFTRAST_Draw_Span_FinishBGRA8, so this looks like a placeholder
// implementation.
4513 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4516 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4517 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4518 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span (startx..endx-1) are cleared
4519 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4520 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the deferred-geometry shader mode: only runs the position
// array through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform (POSITION is both source and destination).
4525 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4527 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the deferred-geometry shader mode. No geometry data is
// written: the span's color bytes are zeroed and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8, so this looks like a placeholder
// implementation.
4530 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4533 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4534 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4535 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span (startx..endx-1) are cleared
4536 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4537 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the deferred-light-source shader mode: only runs the
// position array through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform (POSITION is both source and destination).
4542 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4544 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the deferred-light-source shader mode. No lighting is
// computed: the span's color bytes are zeroed and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8, so this looks like a placeholder
// implementation.
4547 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4550 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4551 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4552 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span (startx..endx-1) are cleared
4553 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4554 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span (pixel) entry points plus the
// vertex arrays and texture units it uses.
// NOTE(review): the table initialisers start with an integer (2) ahead of the
// Vertex pointer, so a leading integer member is expected here but is not
// among the visible lines - confirm its name and meaning in the full file.
4559 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex shader entry point; takes no arguments
4562 void (*Vertex)(void);
// per-span pixel shader entry point
4563 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices; the table entries end each list with ~0
4564 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of GL20TU_* texture unit indices; the table entries end each list with ~0
4565 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4567 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, indexed by thread->shader_mode (see
// DPSOFTRAST_Draw_ProcessSpans); the entry order therefore has to match the
// SHADERMODE_* numbering - presumably the enum order, TODO confirm.
// Each entry is: leading integer, vertex shader, span shader, array list,
// texture unit list; the lists are terminated with ~0 (0xFF as unsigned char).
4569 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4571 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4572 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4573 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4574 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4575 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4576 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4577 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4578 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4579 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4580 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4581 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4582 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four entries below give no texunits initialiser, so that
// member is zero-filled rather than starting with the ~0 terminator used by
// the Depth_Or_Shadow entry above; a consumer that scans for ~0 would see
// texture unit 0 repeated - confirm this is intended or add an explicit {~0}.
4583 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4584 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4585 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4586 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel visibility mask for one span.
// With depth testing enabled and a depth buffer present, each pixel x in
// [startx, endx) gets pixelmask[x] = result of comparing the stored depth
// (depthpixel[x]) against the span's interpolated depth d, according to
// thread->fb_depthfunc. Otherwise every pixel of the span is marked visible.
// The mask pointer and the (possibly tightened) start of the span are written
// back into *span.
// NOTE(review): the declarations of x, d, depth, depthslope, startx and endx,
// the initialisation of endx, the bodies of the two trimming while loops and
// the matching write-back of span->endx are not among the lines visible here.
4589 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4594 unsigned int *depthpixel;
4598 unsigned char *pixelmask;
4599 DPSOFTRAST_State_Triangle *triangle;
4600 triangle = &thread->triangles[span->triangle];
// depthpixel is offset so that it can be indexed directly with the span-relative x
4601 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4602 startx = span->startx;
4604 depth = span->depthbase;
4605 depthslope = span->depthslope;
4606 pixelmask = thread->pixelmaskarray;
4607 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// in every case the left operand is the value already in the depth buffer and
// the right operand is this span's depth, stepped by depthslope per pixel
4609 switch(thread->fb_depthfunc)
4612 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4613 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4614 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4615 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4616 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4617 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4618 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// trim rejected pixels off both ends of the span (loop bodies not visible;
// presumably startx++ and endx-- respectively - TODO confirm)
4620 while (startx < endx && !pixelmask[startx])
4622 while (endx > startx && !pixelmask[endx-1])
4627 // no depth testing means we're just dealing with color...
4628 memset(pixelmask + startx, 1, endx - startx);
4630 span->pixelmask = pixelmask;
4631 span->startx = startx;
// Depth-buffer update step for one span. Does nothing unless depth writes
// (thread->depthmask) and depth testing are both enabled and a depth buffer
// exists. Otherwise it walks the span's pixels with the same interpolated
// depth sequence d = depthbase + depthslope*x that the depth test uses.
// NOTE(review): the loop body and the initialisation of endx are not among the
// lines visible here; presumably d is stored into depthpixel[x] for pixels
// whose pixelmask entry is set - TODO confirm.
4635 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4637 int x, d, depth, depthslope, startx, endx;
4638 const unsigned char *pixelmask;
4639 unsigned int *depthpixel;
4640 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4642 depth = span->depthbase;
4643 depthslope = span->depthslope;
4644 pixelmask = span->pixelmask;
4645 startx = span->startx;
// offset so that depthpixel can be indexed with the span-relative x
4647 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4648 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every span queued on the thread (thread->spans[0..numspans-1]) and
// then empties the queue. Per span: run the depth test, run the current shader
// mode's span function if color output is enabled, then perform the depth
// write.
// NOTE(review): the declaration of i and the statement guarded by the
// empty-span test are not among the lines visible here; presumably an empty
// span is skipped with continue - TODO confirm.
4654 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4657 DPSOFTRAST_State_Triangle *triangle;
4658 DPSOFTRAST_State_Span *span;
4659 for (i = 0; i < thread->numspans; i++)
4661 span = &thread->spans[i];
4662 triangle = &thread->triangles[span->triangle];
// the depth test may shrink the span; startx >= endx means nothing is left to draw
4663 DPSOFTRAST_Draw_DepthTest(thread, span);
4664 if (span->startx >= span->endx)
4666 // run pixel shader if appropriate
4667 // do this before running depthmask code, to allow the pixelshader
4668 // to clear pixelmask values for alpha testing
4669 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4670 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4671 DPSOFTRAST_Draw_DepthWrite(thread, span);
// all queued spans have been consumed
4673 thread->numspans = 0;
// Draw command (opcode 22) payload.
// starty/endy bound the rows the draw may touch; refcount is an atomic
// counter set to the number of threads when the command is issued and
// decremented by each thread once it is done with the command; arrays holds
// the screen coordinates followed by the post-transform vertex arrays;
// exactly one of element3i/element3s is expected to carry the index list.
4676 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one queued Draw command on the given thread.
// For every triangle it: fetches the three vertex indices (16-bit or 32-bit
// element list, rebased by firstvertex), clips against the near plane,
// projects to screen space, rejects triangles outside the scissor rectangle
// or this thread's row bands, derives the per-triangle interpolation planes
// (w, optional clip plane, shader attribute arrays) and mip levels, and
// finally emits spans into thread->spans.
// Spans are flushed through DPSOFTRAST_Draw_ProcessSpans whenever the span
// or triangle buffers reach their limits, and once more at the end.
// command->refcount is decremented when this thread is finished with the
// command; the thread that brings it to zero releases command->arrays when
// the vertex data was allocated outside the command itself.
4678 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4681 int cullface = thread->cullface;
4682 int minx, maxx, miny, maxy;
4683 int miny1, maxy1, miny2, maxy2;
4684 __m128i fbmin, fbmax;
4685 __m128 viewportcenter, viewportscale;
4686 int firstvertex = command->firstvertex;
4687 int numvertices = command->numvertices;
4688 int numtriangles = command->numtriangles;
4689 const int *element3i = command->element3i;
4690 const unsigned short *element3s = command->element3s;
4691 int clipped = command->clipped;
4698 int starty, endy, bandy;
4702 float clip0origin, clip0slope;
4704 __m128 triangleedge1, triangleedge2, trianglenormal;
4707 DPSOFTRAST_State_Triangle *triangle;
4708 DPSOFTRAST_Texture *texture;
4709 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, then this thread's two row bands clamped to it
4710 miny = thread->fb_scissor[1];
4711 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4712 miny1 = bound(miny, thread->miny1, maxy);
4713 maxy1 = bound(miny, thread->maxy1, maxy);
4714 miny2 = bound(miny, thread->miny2, maxy);
4715 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither band: only the refcount
// bookkeeping (and possible release of the vertex data) is needed
4716 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4718 if (!ATOMIC_DECREMENT(command->refcount))
4720 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721 MM_FREE(command->arrays);
4725 minx = thread->fb_scissor[0];
4726 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// packed 16-bit (x, y) bounds used to clamp the triangle bounding box;
// fbmax is inclusive, hence the subtraction of 1
4727 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4728 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4729 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4730 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4731 screen[3] = _mm_setzero_ps();
4732 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4733 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen coordinates; the next
// array (used below for the near-plane distance) follows immediately
4735 const float *screencoord4f = command->arrays;
4736 const float *arrays = screencoord4f + numvertices*4;
4738 // generate the 3 edges of this triangle
4739 // generate spans for the triangle - switch based on left split or right split classification of triangle
4742 e[0] = element3s[i*3+0] - firstvertex;
4743 e[1] = element3s[i*3+1] - firstvertex;
4744 e[2] = element3s[i*3+2] - firstvertex;
4748 e[0] = element3i[i*3+0] - firstvertex;
4749 e[1] = element3i[i*3+1] - firstvertex;
4750 e[2] = element3i[i*3+2] - firstvertex;
// helper macros: SKIPBACKFACE computes the z component of the screen-space
// triangle normal and tests its sign; CLIPPEDVERTEX* build clipped screen
// vertices; GENATTRIB* build the matching (possibly interpolated) attributes
4759 #define SKIPBACKFACE \
4760 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4761 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4762 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4763 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4764 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4768 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4772 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4777 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4778 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4780 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4781 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4783 #define CLIPPEDVERTEXCOPY(k,p1) \
4784 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4786 #define GENATTRIBCOPY(attrib, p1) \
4787 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4788 #define GENATTRIBLERP(attrib, p1, p2) \
4790 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4791 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4793 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4797 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4800 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4801 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4809 // calculate distance from nearplane
4810 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4811 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4812 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4813 if (clipdist[0] >= 0.0f)
4815 if (clipdist[1] >= 0.0f)
4817 if (clipdist[2] >= 0.0f)
4820 // triangle is entirely in front of nearplane
4821 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4828 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4836 if (clipdist[2] >= 0.0f)
4838 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4845 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4852 else if (clipdist[1] >= 0.0f)
4854 if (clipdist[2] >= 0.0f)
4856 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4863 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4869 else if (clipdist[2] >= 0.0f)
4871 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4876 else continue; // triangle is entirely behind nearplane
4879 // calculate integer y coords for triangle points
4880 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4881 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4882 screenmin = _mm_min_epi16(screeni, screenir),
4883 screenmax = _mm_max_epi16(screeni, screenir);
4884 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4885 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4886 screenmin = _mm_max_epi16(screenmin, fbmin);
4887 screenmax = _mm_min_epi16(screenmax, fbmax);
4888 // skip offscreen triangles
4889 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// clamped vertical extent of the triangle; endy is exclusive
4891 starty = _mm_extract_epi16(screenmin, 1);
4892 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely between this thread's two bands
4893 if (starty >= maxy1 && endy <= miny2)
4895 screeny = _mm_srai_epi32(screeni, 16);
4898 triangle = &thread->triangles[thread->numtriangles];
4900 // calculate attribute plans for triangle data...
4901 // okay, this triangle is going to produce spans, we'd better project
4902 // the interpolants now (this is what gives perspective texturing),
4903 // this consists of simply multiplying all arrays by the W coord
4904 // (which is basically 1/Z), which will be undone per-pixel
4905 // (multiplying by Z again) to get the perspective-correct array
4908 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4909 __m128 mipedgescale, mipdensity;
// edge vectors divided by the normal's z component give the factors that
// turn per-edge attribute deltas into per-pixel x/y slopes
4910 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4911 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4912 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4913 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4914 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4915 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4916 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4917 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4918 attribedge1 = _mm_sub_ss(w0, w1);
4919 attribedge2 = _mm_sub_ss(w2, w1);
4920 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4921 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4922 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4923 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4924 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// w plane stored as (x slope, y slope, origin)
4925 _mm_store_ss(&triangle->w[0], attribxslope);
4926 _mm_store_ss(&triangle->w[1], attribyslope);
4927 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when one of its first three coefficients
// is non-zero; the screen z plane is rebuilt and combined with the clip
// plane coefficients to get a per-row clip boundary (clip0origin/clip0slope)
4932 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4934 float cliporigin, clipxslope, clipyslope;
4935 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4936 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4937 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4938 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4939 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4940 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4941 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4942 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4943 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4946 clip0origin = -cliporigin/clipxslope;
4947 clip0slope = -clipyslope/clipxslope;
4948 clip0dir = clipxslope > 0 ? 1 : -1;
4950 else if(clipyslope > 0)
4952 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4953 clip0slope = dpsoftrast.fb_width;
4956 else if(clipyslope < 0)
4958 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4959 clip0slope = -dpsoftrast.fb_width;
4962 else if(clip0origin < 0) continue;
// interpolation planes for every vertex array the current shader mode uses
4965 mipedgescale = _mm_setzero_ps();
4966 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4968 __m128 attrib0, attrib1, attrib2;
4969 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4970 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4972 arrays += numvertices*4;
4973 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are pre-multiplied by w before the plane is derived
4974 attriborigin = _mm_mul_ps(attrib1, w1);
4975 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4976 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4977 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4978 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4979 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4980 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4981 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4982 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array designated by lodarrayindex also yields the attribute change
// per unit of screen-space edge length, used for mip selection below
4983 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4985 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4986 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4987 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4988 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// per-texture-unit mip level: defaults to 0 and is only computed for bound
// textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4992 memset(triangle->mip, 0, sizeof(triangle->mip));
4993 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4995 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4996 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4998 texture = thread->texbound[texunit];
4999 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5001 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5002 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5003 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5004 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5005 // this will be multiplied in the texturing routine by the texture resolution
5006 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the squared density, clamped to the last available mip
5009 y = (int)(log((float)y)*0.5f/M_LN2);
5010 if (y > texture->mipmaps - 1)
5011 y = texture->mipmaps - 1;
5012 triangle->mip[texunit] = y;
// span generation: rows are walked band by band (first band up to maxy1,
// then from miny2 up to maxy2)
5018 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5021 __m128 xcoords, xslope;
// yccmask holds four bits per vertex, set when y is greater than that
// vertex's integer y; the switch tables below pick the two active edges
// (edgeNp -> edgeNn) from that pattern
5022 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5023 int yccmask = _mm_movemask_epi8(ycc);
5024 int edge0p, edge0n, edge1p, edge1n;
5033 case 0xFFFF: /*0000*/ y = endy; continue;
5034 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5035 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5036 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5037 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5038 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5039 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5040 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5041 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5042 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5043 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5044 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5045 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5046 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5047 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5048 case 0x0000: /*1111*/ y++; continue;
5056 case 0xFFFF: /*000*/ y = endy; continue;
5057 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5058 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5059 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5060 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5061 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5062 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5063 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can be drawn with the current edge pair, limited
// to the current band
5066 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5067 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5068 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5069 nexty = _mm_extract_epi16(ycc, 0);
5070 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges, then the x position of both edges on row y
5071 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5072 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5073 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5074 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5075 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half so it becomes startx
5076 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5078 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5079 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5081 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5082 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5084 int startx, endx, offset;
5085 startx = _mm_cvtss_si32(xcoords);
5086 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row to the horizontal scissor range
5087 if (startx < minx) startx = minx;
5088 if (endx > maxx) endx = maxx;
5089 if (startx >= endx) continue;
5097 if(endx <= clip0) continue;
5098 startx = (int)clip0;
5101 else if (endx > clip0)
5103 if(startx >= clip0) continue;
// long rows are split into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
5108 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5110 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5111 span->triangle = thread->numtriangles;
5115 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5116 if (span->startx >= span->endx)
// fixed-point depth for the span start, including polygon offset
5118 wslope = triangle->w[0];
5119 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5120 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5121 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5122 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5123 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush pending spans before reusing triangle slots
5128 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5130 DPSOFTRAST_Draw_ProcessSpans(thread);
5131 thread->numtriangles = 0;
// drop this thread's reference; the last thread releases vertex data that
// was allocated separately from the command
5135 if (!ATOMIC_DECREMENT(command->refcount))
5137 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5138 MM_FREE(command->arrays);
// final flush of whatever is still queued
5141 if (thread->numspans > 0 || thread->numtriangles > 0)
5143 DPSOFTRAST_Draw_ProcessSpans(thread);
5144 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its per-vertex data and a
// private copy of the index list.
// firstvertex/numvertices: vertex range of the draw.
// numtriangles: number of triangles in the element list.
// element3i/element3s: 32-bit or 16-bit index list to copy into the command.
// Side effects: resets dpsoftrast.post_array4f and points its entries (and
// dpsoftrast.screencoord4f) into the newly reserved storage so the vertex
// shader can fill them in.
5149 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5153 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays are always needed: screen coordinates and positions
5154 int datasize = 2*numvertices*sizeof(float[4]);
5155 DPSOFTRAST_Command_Draw *command;
5156 unsigned char *data;
// one more float[4] array per vertex array used by the current shader mode
5157 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5159 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5160 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5162 datasize += numvertices*sizeof(float[4]);
// room for the index list (16-bit or 32-bit variant)
5165 datasize += numtriangles*sizeof(unsigned short[3]);
5167 datasize += numtriangles*sizeof(int[3]);
5168 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// data that would make the command exceed the maximum command size is
// allocated separately with MM_CALLOC; otherwise it is placed directly
// behind the command header
5169 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5171 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5172 data = (unsigned char *)MM_CALLOC(datasize, 1);
5176 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5177 data = (unsigned char *)command + commandsize;
5179 command->firstvertex = firstvertex;
5180 command->numvertices = numvertices;
5181 command->numtriangles = numtriangles;
5182 command->arrays = (float *)data;
// carve the data block up: screen coordinates, positions, then one array
// per shader-used vertex array, in shader table order
5183 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5184 dpsoftrast.firstvertex = firstvertex;
5185 dpsoftrast.numvertices = numvertices;
5186 dpsoftrast.screencoord4f = (float *)data;
5187 data += numvertices*sizeof(float[4]);
5188 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5189 data += numvertices*sizeof(float[4]);
5190 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5192 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5193 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5195 dpsoftrast.post_array4f[j] = (float *)data;
5196 data += numvertices*sizeof(float[4]);
// the index list goes last; only one of the two pointers ends up non-NULL
5198 command->element3i = NULL;
5199 command->element3s = NULL;
5202 command->element3s = (unsigned short *)data;
5203 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5207 command->element3i = (int *)data;
5208 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public entry point: queues a triangle draw.
// Allocates the Draw command, runs the current shader mode's vertex stage,
// records the affected row range and hands the command to the threads.
// firstvertex/numvertices: vertex range; numtriangles: triangle count;
// element3i/element3s: 32-bit or 16-bit index list.
5213 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5215 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5216 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range produced by the vertex stage, limited to the framebuffer height
5217 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5218 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and take the
// command back out of the queue
5219 if (command->starty >= command->endy)
5221 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5222 MM_FREE(command->arrays);
5223 DPSOFTRAST_UndoCommand(command->commandsize);
5226 command->clipped = dpsoftrast.drawclipped;
// every thread holds one reference until it has interpreted the command
5227 command->refcount = dpsoftrast.numthreads;
5229 if (dpsoftrast.usethreads)
5232 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row bands overlap the draw
5233 for (i = 0; i < dpsoftrast.numthreads; i++)
5235 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5236 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5237 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded path - confirm
5242 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command (opcode 23): carries the new framebuffer size.
5246 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for SetRenderTargets: flags the thread's framebuffer
// state for revalidation.
5247 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5249 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Binds a new set of render targets.
// width/height: framebuffer dimensions in pixels.
// depthpixels: depth buffer; colorpixels0..3: up to four color buffers.
// Updates the global framebuffer state, recomputes the viewport transform
// and queues a SetRenderTargets command for the threads.
5251 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5253 DPSOFTRAST_Command_SetRenderTargets *command;
// detect whether any dimension or target differs from the current binding
5254 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5255 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5256 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5258 dpsoftrast.fb_width = width;
5259 dpsoftrast.fb_height = height;
5260 dpsoftrast.fb_depthpixels = depthpixels;
5261 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5262 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5263 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5264 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport center/scale depend on the framebuffer size
5265 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5266 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5267 command->width = width;
5268 command->height = height;
// Interprets queued commands for one thread, starting at the thread's
// current command offset and stopping when endoffset is reached.
// thread:    thread state whose commandoffset is advanced.
// endoffset: offset in the command pool at which to stop.
5271 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5273 int commandoffset = thread->commandoffset;
5274 while (commandoffset != endoffset)
5276 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5277 switch (command->opcode)
// fixed-size commands: call the handler, advance by the aligned struct
// size and wrap to offset 0 at the end of the pool
5279 #define INTERPCOMMAND(name) \
5280 case DPSOFTRAST_OPCODE_##name : \
5281 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5282 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5283 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5284 commandoffset = 0; \
5286 INTERPCOMMAND(Viewport)
5287 INTERPCOMMAND(ClearColor)
5288 INTERPCOMMAND(ClearDepth)
5289 INTERPCOMMAND(ColorMask)
5290 INTERPCOMMAND(DepthTest)
5291 INTERPCOMMAND(ScissorTest)
5292 INTERPCOMMAND(Scissor)
5293 INTERPCOMMAND(BlendFunc)
5294 INTERPCOMMAND(BlendSubtract)
5295 INTERPCOMMAND(DepthMask)
5296 INTERPCOMMAND(DepthFunc)
5297 INTERPCOMMAND(DepthRange)
5298 INTERPCOMMAND(PolygonOffset)
5299 INTERPCOMMAND(CullFace)
5300 INTERPCOMMAND(AlphaTest)
5301 INTERPCOMMAND(AlphaFunc)
5302 INTERPCOMMAND(SetTexture)
5303 INTERPCOMMAND(SetShader)
5304 INTERPCOMMAND(Uniform4f)
5305 INTERPCOMMAND(UniformMatrix4f)
5306 INTERPCOMMAND(Uniform1i)
5307 INTERPCOMMAND(SetRenderTargets)
5308 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the stored commandsize is used; the
// thread's offset is published right after each draw
5310 case DPSOFTRAST_OPCODE_Draw:
5311 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5312 commandoffset += command->commandsize;
5313 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5315 thread->commandoffset = commandoffset;
5318 case DPSOFTRAST_OPCODE_Reset:
// publish the final position once the loop is done
5323 thread->commandoffset = commandoffset;
// Worker thread entry point.
// data: the DPSOFTRAST_State_Thread owned by this worker.
// Runs for as long as thread->index is non-negative: interprets pending
// commands when the thread is behind dpsoftrast.drawcommand, otherwise
// sleeps on drawcond until it is signalled.
5326 static int DPSOFTRAST_Draw_Thread(void *data)
5328 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5329 while(thread->index >= 0)
5331 if (thread->commandoffset != dpsoftrast.drawcommand)
5333 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5337 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before going to sleep
5338 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher waiting on this thread that it has caught up
5340 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving marks the thread as asleep and in need of a drawcond signal
5341 thread->starving = true;
5342 Thread_CondWait(thread->drawcond, thread->drawmutex);
5343 thread->starving = false;
5345 Thread_UnlockMutex(thread->drawmutex);
// Makes every thread consume all queued commands, then marks the command
// pool as empty.
// With threading enabled it wakes sleeping workers and blocks until each
// has reached dpsoftrast.drawcommand.
5351 static void DPSOFTRAST_Draw_FlushThreads(void)
5353 DPSOFTRAST_State_Thread *thread;
5355 DPSOFTRAST_Draw_SyncCommands();
5356 if (dpsoftrast.usethreads)
// pass 1: wake every worker that is behind and currently asleep
5358 for (i = 0; i < dpsoftrast.numthreads; i++)
5360 thread = &dpsoftrast.threads[i];
5361 if (thread->commandoffset != dpsoftrast.drawcommand)
5363 Thread_LockMutex(thread->drawmutex);
5364 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5365 Thread_CondSignal(thread->drawcond);
5366 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait on each worker's waitcond until it has caught up; the
// unlocked check avoids taking the mutex for workers that are already done
5369 for (i = 0; i < dpsoftrast.numthreads; i++)
5371 thread = &dpsoftrast.threads[i];
5372 if (thread->commandoffset != dpsoftrast.drawcommand)
5374 Thread_LockMutex(thread->drawmutex);
5375 if (thread->commandoffset != dpsoftrast.drawcommand)
5377 thread->waiting = true;
5378 Thread_CondWait(thread->waitcond, thread->drawmutex);
5379 thread->waiting = false;
5381 Thread_UnlockMutex(thread->drawmutex);
// NOTE(review): presumably the non-threaded path - the caller interprets
// the pending commands for each thread state itself; confirm
5387 for (i = 0; i < dpsoftrast.numthreads; i++)
5389 thread = &dpsoftrast.threads[i];
5390 if (thread->commandoffset != dpsoftrast.drawcommand)
5391 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool usage counter is reset
5394 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point: makes all threads finish the queued commands by
// delegating to DPSOFTRAST_Draw_FlushThreads.
5397 void DPSOFTRAST_Flush(void)
5399 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably blocks until all queued rendering work has
// completed, like DPSOFTRAST_Flush - confirm against the function body.
5402 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and creates the per-thread state
// (and worker threads when threading is available).
// width/height: framebuffer dimensions in pixels.
// numthreads:   requested worker count; threading is used only when this is
//               positive and Thread_HasThreads() reports support, and the
//               count is then limited with bound(1, numthreads, 64). Without
//               threading a single thread state is still created.
// interlace:    limited with bound(0, interlace, 1); forced to 0 without
//               threading.
// colorpixels:  color buffer bound as color target 0.
// depthpixels:  depth buffer.
// NOTE(review): the return value is produced on a line not shown here -
// presumably 0 on success; confirm.
5407 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5417 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5418 dpsoftrast.bigendian = u.b[3];
5419 dpsoftrast.fb_width = width;
5420 dpsoftrast.fb_height = height;
5421 dpsoftrast.fb_depthpixels = depthpixels;
5422 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only color target 0 is bound at startup; the remaining three are cleared
// (the original assigned index 1 three times)
5423 dpsoftrast.fb_colorpixels[1] = NULL;
5424 dpsoftrast.fb_colorpixels[2] = NULL;
5425 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5426 dpsoftrast.viewport[0] = 0;
5427 dpsoftrast.viewport[1] = 0;
5428 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5429 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5430 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5431 dpsoftrast.texture_firstfree = 1;
5432 dpsoftrast.texture_end = 1;
5433 dpsoftrast.texture_max = 0;
5434 dpsoftrast.color[0] = 1;
5435 dpsoftrast.color[1] = 1;
5436 dpsoftrast.color[2] = 1;
5437 dpsoftrast.color[3] = 1;
5438 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5439 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5440 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5441 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread state the same default render state
5442 for (i = 0; i < dpsoftrast.numthreads; i++)
5444 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5446 thread->cullface = GL_BACK;
5447 thread->colormask[0] = 1;
5448 thread->colormask[1] = 1;
5449 thread->colormask[2] = 1;
5450 thread->colormask[3] = 1;
5451 thread->blendfunc[0] = GL_ONE;
5452 thread->blendfunc[1] = GL_ZERO;
5453 thread->depthmask = true;
5454 thread->depthtest = true;
5455 thread->depthfunc = GL_LEQUAL;
5456 thread->scissortest = false;
5457 thread->alphatest = false;
5458 thread->alphafunc = GL_GREATER;
5459 thread->alphavalue = 0.5f;
5460 thread->viewport[0] = 0;
5461 thread->viewport[1] = 0;
5462 thread->viewport[2] = dpsoftrast.fb_width;
5463 thread->viewport[3] = dpsoftrast.fb_height;
5464 thread->scissor[0] = 0;
5465 thread->scissor[1] = 0;
5466 thread->scissor[2] = dpsoftrast.fb_width;
5467 thread->scissor[3] = dpsoftrast.fb_height;
5468 thread->depthrange[0] = 0;
5469 thread->depthrange[1] = 1;
5470 thread->polygonoffset[0] = 0;
5471 thread->polygonoffset[1] = 0;
5472 thread->clipplane[0] = 0;
5473 thread->clipplane[1] = 0;
5474 thread->clipplane[2] = 0;
5475 thread->clipplane[3] = 1;
5477 thread->numspans = 0;
5478 thread->numtriangles = 0;
5479 thread->commandoffset = 0;
5480 thread->waiting = false;
5481 thread->starving = false;
// force a full validation of the derived state
5483 thread->validate = -1;
5484 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker itself exist only when threading
5486 if (dpsoftrast.usethreads)
5488 thread->waitcond = Thread_CreateCond();
5489 thread->drawcond = Thread_CreateCond();
5490 thread->drawmutex = Thread_CreateMutex();
5491 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Tears the rasterizer down: stops and destroys the worker threads (when
// threading is in use), releases texture storage and the thread state
// array, and zeroes the global state.
5497 void DPSOFTRAST_Shutdown(void)
5500 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5502 DPSOFTRAST_State_Thread *thread;
5503 for (i = 0; i < dpsoftrast.numthreads; i++)
5505 thread = &dpsoftrast.threads[i];
// wake the worker under its mutex, then join it before destroying the
// synchronization objects it uses
5506 Thread_LockMutex(thread->drawmutex);
5508 Thread_CondSignal(thread->drawcond);
5509 Thread_UnlockMutex(thread->drawmutex);
5510 Thread_WaitThread(thread->thread, 0);
5511 Thread_DestroyCond(thread->waitcond);
5512 Thread_DestroyCond(thread->drawcond);
5513 Thread_DestroyMutex(thread->drawmutex);
// NOTE(review): this loop indexes dpsoftrast.texture before the NULL check
// on dpsoftrast.texture that follows it, and DPSOFTRAST_Init leaves
// texture_end at 1 with the pointer zeroed - confirm the table is always
// allocated before shutdown, or guard the loop with the same check.
5516 for (i = 0;i < dpsoftrast.texture_end;i++)
5517 if (dpsoftrast.texture[i].bytes)
5518 MM_FREE(dpsoftrast.texture[i].bytes);
// NOTE(review): the table is released with free() rather than MM_FREE -
// presumably matching how it was allocated; confirm.
5519 if (dpsoftrast.texture)
5520 free(dpsoftrast.texture);
5521 if (dpsoftrast.threads)
5522 MM_FREE(dpsoftrast.threads);
5523 memset(&dpsoftrast, 0, sizeof(dpsoftrast));