3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
14 #define ATOMIC_SIZE 32
// Platform-specific alignment attributes and atomic counter primitives.
// ALIGN(var) requests 16-byte alignment, ATOMIC(var) requests 32-byte alignment.
// Apple: counters are updated through the OSAtomic*32Barrier routines.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC-compatible compilers: counters are updated through the __sync builtins.
// ATOMIC_INCREMENT/ATOMIC_DECREMENT yield the updated value; ATOMIC_ADD discards its result.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: counters are updated through the Interlocked* functions.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, no barrier, and plain (non-atomic)
// integer arithmetic on the counter.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
81 #define MM_FREE _mm_free
// Alternative definitions that map the MM_* allocation macros onto the
// standard C allocator.
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color and eight
// texture coordinate sets). DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// is used to size the per-array tables (triangle attribs, post_array4f).
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table. A slot whose bytes pointer is NULL is treated
// as free by the texture lookup and allocation code.
104 typedef struct DPSOFTRAST_Texture_s
// filter mode set through DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// counter declared with the platform's atomic counter type
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels (allocated with MM_CALLOC, released with MM_FREE)
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are padded to a multiple of COMMAND_SIZE and declared with the
// ALIGN() alignment attribute.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: an opcode byte followed by the size
// of the command as stored in the command pool.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// Opcode 0 is reserved; DPSOFTRAST_AllocateCommand writes it into the unused
// tail of the pool.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> and the struct type DPSOFTRAST_Command_<name>, which
// starts with the same opcode/commandsize header as DPSOFTRAST_Command_s.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool and of the largest single command.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage from which DPSOFTRAST_AllocateCommand carves commands.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
// raw command bytes, indexed by the pool's freecommand offset
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle data consumed when spans are shaded.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] is the per-x slope, [1] the per-y slope and [2] the base
// value of each 4-component attribute (this is how DPSOFTRAST_CALCATTRIB reads them)
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// Evaluates one 4-component attribute at the start of a span using SSE:
//   slope = attribs[arrayindex][0]                      (change per pixel in x)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// triangle and span are pointers; data and slope are __m128 lvalues.
// The attribs rows are read with aligned loads.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
		_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
			_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: data and slope are float[4]
// (or pointers to four floats). Every macro argument is parenthesized so that
// expression arguments such as `spans + i` or `&triangle` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One span of framebuffer pixels generated from a triangle; the x/y fields are
// the values DPSOFTRAST_CALCATTRIB multiplies the attribute slopes with.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacity of the per-thread span and triangle arrays.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in the thread's validate field. Each bit marks one group of
// derived values as stale; DPSOFTRAST_Validate clears the bit and recomputes
// the group (RecalcFB, RecalcDepthFunc, RecalcBlendFunc respectively).
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all groups needed before drawing
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc maps the
// source/destination blend factor pair (and the subtract flag) to one of these;
// unrecognized pairs fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes, not a mode itself
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* handlers write the
// render state fields here as they process commands, and the Recalc* functions
// fill in the derived fb_* values.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// set by the PolygonOffset command: [0] alongnormal, [1] intoview
233 float polygonoffset[2];
236 int shader_permutation;
// textures bound per unit; pointers into the global texture table
// (DPSOFTRAST_Texture_Grow rebases them when the table moves)
238 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243 // DPSOFTRAST_VALIDATE_ flags
246 // derived values (DPSOFTRAST_VALIDATE_FB)
// written by DPSOFTRAST_RecalcViewport
249 ALIGN(float fb_viewportcenter[4]);
250 ALIGN(float fb_viewportscale[4]);
252 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
255 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it
// with the pool's free offset to work out how much of the pool is still in use
264 ATOMIC(volatile int commandoffset);
// handshake flags used together with the thread's mutex and condition
// variables in DPSOFTRAST_Draw_FreeCommandPool
266 volatile bool waiting;
267 volatile bool starving;
274 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 DPSOFTRAST_State_Thread);
// Global rasterizer state, written by the public DPSOFTRAST_* entry points.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets; a NULL color target is skipped when clearing
283 unsigned int *fb_depthpixels;
284 unsigned int *fb_colorpixels[4];
// copy of the viewport transform, refreshed by DPSOFTRAST_Viewport
287 ALIGN(float fb_viewportcenter[4]);
288 ALIGN(float fb_viewportscale[4]);
291 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
294 const float *pointer_vertex3f;
295 const float *pointer_color4f;
296 const unsigned char *pointer_color4ub;
297 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound textures; pointers into the texture table below
302 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
306 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307 float *screencoord4f;
313 int shader_permutation;
// texture table: texture_firstfree is where the search for a free slot starts
317 int texture_firstfree;
318 DPSOFTRAST_Texture *texture;
// message describing the most recent error reported by an entry point
323 const char *errorstring;
// array of numthreads worker states
328 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
330 ATOMIC(volatile int drawcommand);
332 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
336 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the integer depth
// buffer format (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color components (nominally 0..1) into a 32-bit BGRA8 value:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7. Each component
// is scaled by 255 and rounded to nearest. Arguments are parenthesized so that
// expressions such as `a + b` are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
346 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348 fb_viewportcenter[3] = 0.5f;
349 fb_viewportcenter[0] = 0.0f;
350 fb_viewportscale[1] = 0.5f * viewport[2];
351 fb_viewportscale[2] = -0.5f * viewport[3];
352 fb_viewportscale[3] = 0.5f;
353 fb_viewportscale[0] = 1.0f;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for one thread:
// fb_scissor as {x, y, width, height} in framebuffer rows, and the viewport
// center/scale vectors.
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
358 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359 // and viewport projection values
362 x1 = thread->scissor[0];
363 x2 = thread->scissor[0] + thread->scissor[2];
// the scissor y range is flipped against the framebuffer height
364 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with scissor testing off the whole framebuffer is used
366 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
368 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
370 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371 thread->fb_scissor[0] = x1;
372 thread->fb_scissor[1] = y1;
373 thread->fb_scissor[2] = x2 - x1;
374 thread->fb_scissor[3] = y2 - y1;
376 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
381 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from the blend factor pair blendfunc[0]
// (source) / blendfunc[1] (destination). The two factors are packed into one
// integer, (sfactor<<16)|dfactor, so a single switch can match the pair.
// With blendsubtract set only GL_SRC_ALPHA/GL_ONE is recognized (SUBALPHA);
// any pair without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
386 if (thread->blendsubtract)
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
398 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
400 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to the same MUL mode
405 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in f
// is currently set in thread->validate; the expression always evaluates to 0.
// Both arguments are parenthesized so pointer expressions (e.g. `threads + i`)
// expand correctly. Note that thread and f are each evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived value groups selected by mask that are currently
// marked stale in thread->validate, clearing each flag before its Recalc
// function runs.
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only groups that are both requested and stale are processed
419 mask &= thread->validate;
422 if (mask & DPSOFTRAST_VALIDATE_FB)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425 DPSOFTRAST_RecalcFB(thread);
427 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
429 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430 DPSOFTRAST_RecalcDepthFunc(thread);
432 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
434 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435 DPSOFTRAST_RecalcBlendFunc(thread);
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
441 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442 return &dpsoftrast.texture[index];
// Enlarges the global texture table (to 1024 entries at first, doubling
// afterwards) and rebases every bound-texture pointer, both global and
// per-thread, onto the reallocated table.
446 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound pointers can be turned back into table offsets
448 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449 DPSOFTRAST_State_Thread *thread;
453 // expand texture array as needed
454 if (dpsoftrast.texture_max < 1024)
455 dpsoftrast.texture_max = 1024;
457 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check; on failure
// the old table pointer is lost and the rebasing below would compute pointers
// from NULL.
458 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures: new base + old element offset
459 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460 if (dpsoftrast.texbound[i])
461 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the textures bound in each worker thread the same way
462 for (j = 0; j < dpsoftrast.numthreads; j++)
464 thread = &dpsoftrast.threads[j];
465 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466 if (thread->texbound[i])
467 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions in the first free slot of the
// texture table, growing the table when needed, and allocates zero-filled
// storage for all of its mip levels. Each rejected parameter combination
// stores a message in dpsoftrast.errorstring.
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
480 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is computed in int; very large dimensions could
// overflow before the MAXSIZE check below runs.
483 if (width*height*depth < 1)
485 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
490 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
495 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): this second assignment repeats the DEPTH-format message; if it
// belongs to a different branch of the format switch (e.g. an unknown format)
// the text looks copy-pasted — confirm.
507 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above (redundant)
527 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
534 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
537 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
539 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542 // find first empty slot in texture array
543 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544 if (!dpsoftrast.texture[texnum].bytes)
// the slot after the one being taken is where the next search starts
546 dpsoftrast.texture_firstfree = texnum + 1;
547 if (dpsoftrast.texture_max <= texnum)
548 DPSOFTRAST_Texture_Grow();
549 if (dpsoftrast.texture_end <= texnum)
550 dpsoftrast.texture_end = texnum + 1;
551 texture = &dpsoftrast.texture[texnum];
552 memset(texture, 0, sizeof(*texture));
553 texture->flags = flags;
554 texture->width = width;
555 texture->height = height;
556 texture->depth = depth;
557 texture->sides = sides;
// byte size of this mip level: 4 bytes per texel for every side
569 s = w * h * d * sides * 4;
// record the level's byte offset, byte size and dimensions
570 texture->mipmap[mipmaps][0] = size;
571 texture->mipmap[mipmaps][1] = s;
572 texture->mipmap[mipmaps][2] = w;
573 texture->mipmap[mipmaps][3] = h;
574 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or immediately when mipmaps were not requested
577 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 texture->mipmaps = mipmaps;
584 texture->size = size;
586 // allocate the pixels now
587 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its table slot and
// updates the free/used bookkeeping of the texture table. Does nothing when
// the index does not name an allocated texture.
591 void DPSOFTRAST_Texture_Free(int index)
593 DPSOFTRAST_Texture *texture;
594 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
598 MM_FREE(texture->bytes);
599 texture->bytes = NULL;
600 memset(texture, 0, sizeof(*texture));
601 // adjust the free range and used range
602 if (dpsoftrast.texture_firstfree > index)
603 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing free slots
604 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index`, each by averaging
// texels of the level above it (4 bytes per texel, averaged per byte with
// rounding). Does nothing for unknown textures or textures with a single level.
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
609 int i, x, y, z, w, layer0, layer1, row0, row1;
// o walks the output row; i0..i3 walk the (up to four) parent rows being averaged
610 unsigned char *o, *i0, *i1, *i2, *i3;
611 DPSOFTRAST_Texture *texture;
612 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613 if (texture->mipmaps <= 1)
615 for (i = 1;i < texture->mipmaps;i++)
617 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
621 if (layer1 >= texture->mipmap[i-1][4])
622 layer1 = texture->mipmap[i-1][4]-1;
623 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
627 if (row1 >= texture->mipmap[i-1][3])
628 row1 = texture->mipmap[i-1][3]-1;
// row start = level byte offset + 4 * ((height * layer + row) * width)
629 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
630 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634 w = texture->mipmap[i][2];
637 if (texture->mipmap[i-1][2] > 1)
639 // average 3D texture
// eight parent texels per output texel: +4 then >>3 rounds to nearest
640 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
642 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
650 // average 3D mipmap with parent width == 1
// NOTE(review): i2/i3 are not advanced by this loop; that only matters if w
// can exceed 1 in this branch — confirm.
651 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
653 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
662 if (texture->mipmap[i-1][2] > 1)
664 // average 2D texture (common case)
// four parent texels per output texel: +2 then >>2 rounds to nearest
665 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
667 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
675 // 2D texture with parent width == 1
// two parent texels (one per row): +1 then >>1 rounds to nearest
676 o[0] = (i0[0] + i1[0] + 1) >> 1;
677 o[1] = (i0[1] + i1[1] + 1) >> 1;
678 o[2] = (i0[2] + i1[2] + 1) >> 1;
679 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then regenerates
// the mipmaps. Does nothing when the texture does not exist.
// NOTE(review): the visible statements address mip level 0 only and do not
// use the `mip` parameter; the block is not checked against the texture size
// here — confirm callers guarantee it fits.
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
688 DPSOFTRAST_Texture *texture;
690 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
693 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694 while (blockheight > 0)
696 memcpy(dst, pixels, blockwidth * 4);
// source rows are packed; destination rows are a full texture width apart
697 pixels += blockwidth * 4;
698 dst += texture->mipmap[0][2] * 4;
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` and regenerates the mipmaps.
// `pixels` must provide at least mipmap[0][1] bytes (the byte size of level 0).
// Does nothing when the texture does not exist.
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
709 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710 DPSOFTRAST_Texture_CalculateMipmaps(index);
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
714 DPSOFTRAST_Texture *texture;
715 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716 return texture->mipmap[mip][2];
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
720 DPSOFTRAST_Texture *texture;
721 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->mipmap[mip][3];
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 (null) when the texture does not exist.
// NOTE(review): `mip` is used as an index into mipmap[] without a range check.
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within bytes
736 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported as an error (via
// dpsoftrast.errorstring) for textures created without the MIPMAP flag.
// Does nothing when the texture does not exist.
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
740 DPSOFTRAST_Texture *texture;
741 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
744 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
749 texture->filter = filter;
// Installs the framebuffer dimensions, the depth buffer and up to four color
// buffers as the current render targets. The caller keeps ownership of the
// pixel arrays; only the pointers are stored.
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when any of the targets or the size differs from the current setup
754 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
758 dpsoftrast.fb_width = width;
759 dpsoftrast.fb_height = height;
760 dpsoftrast.fb_depthpixels = depthpixels;
761 dpsoftrast.fb_colorpixels[0] = colorpixels0;
762 dpsoftrast.fb_colorpixels[1] = colorpixels1;
763 dpsoftrast.fb_colorpixels[2] = colorpixels2;
764 dpsoftrast.fb_colorpixels[3] = colorpixels3;
767 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the command pool's current free offset as dpsoftrast.drawcommand.
// When threads are in use a store fence is issued first, so the command bytes
// written so far are ordered before the new drawcommand value.
769 static void DPSOFTRAST_Draw_SyncCommands(void)
771 if(dpsoftrast.usethreads) MEMORY_BARRIER;
772 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes sure `space` bytes can be allocated from the command pool. The amount
// of pool still in use is recomputed from the threads' command offsets; if
// that is not enough, the function waits on a lagging thread's condition
// variable.
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
777 DPSOFTRAST_State_Thread *thread;
779 int freecommand = dpsoftrast.commandpool.freecommand;
780 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
781 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish everything queued so far so the threads can consume it
783 DPSOFTRAST_Draw_SyncCommands();
789 for (i = 0; i < dpsoftrast.numthreads; i++)
791 thread = &dpsoftrast.threads[i];
// distance between the free offset and this thread's offset, wrapped into
// the pool size when negative
792 commandoffset = freecommand - thread->commandoffset;
793 if (commandoffset < 0)
794 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance seen: it determines how much of the pool is in use
795 if (commandoffset > usedcommands)
798 usedcommands = commandoffset;
801 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
803 thread = &dpsoftrast.threads[waitindex];
804 Thread_LockMutex(thread->drawmutex);
// only wait while the thread has not yet reached the published offset
805 if (thread->commandoffset != dpsoftrast.drawcommand)
807 thread->waiting = true;
// wake the thread first if it flagged itself as starving
808 if (thread->starving) Thread_CondSignal(thread->drawcond);
809 Thread_CondWait(thread->waitcond, thread->drawmutex);
810 thread->waiting = false;
812 Thread_UnlockMutex(thread->drawmutex);
814 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE; values that already are
// a multiple are returned unchanged. COMMAND_SIZE must be a power of two.
// size is evaluated once.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> from the command pool:
// passes the matching opcode and the struct size rounded up with
// DPSOFTRAST_ALIGNCOMMAND to DPSOFTRAST_AllocateCommand and casts the result
// to the command's struct type.
819 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
820 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a command in the command pool and fills in the
// command header (opcode and commandsize). When the pool is too full, space is
// reclaimed first: by waiting for threads when threading is enabled, otherwise
// through DPSOFTRAST_Draw_FlushThreads.
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 DPSOFTRAST_Command *command;
825 int freecommand = dpsoftrast.commandpool.freecommand;
826 int usedcommands = dpsoftrast.commandpool.usedcommands;
// one extra command header is always budgeted on top of the command itself
827 int extra = sizeof(DPSOFTRAST_Command);
// if the command does not fit before the end of the pool, the unused tail
// counts as consumed too
828 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
830 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832 if (dpsoftrast.usethreads)
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
835 DPSOFTRAST_Draw_FlushThreads();
// reload: the calls above may have changed the pool bookkeeping
836 freecommand = dpsoftrast.commandpool.freecommand;
837 usedcommands = dpsoftrast.commandpool.usedcommands;
839 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
// mark the unused tail with the reserved Reset opcode and account for it
841 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842 command->opcode = DPSOFTRAST_OPCODE_Reset;
843 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
846 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847 command->opcode = opcode;
// NOTE(review): commandsize is an unsigned short; sizes are presumably bounded
// by DPSOFTRAST_DRAW_MAXCOMMANDSIZE — confirm.
848 command->commandsize = size;
850 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
852 dpsoftrast.commandpool.freecommand = freecommand;
853 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recent allocation of `size` bytes from the command pool
// by rewinding the pool's free offset and used-byte count.
857 static void DPSOFTRAST_UndoCommand(int size)
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap the offset back into the pool
863 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864 usedcommands -= size;
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands;
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
872 thread->viewport[0] = command->x;
873 thread->viewport[1] = command->y;
874 thread->viewport[2] = command->width;
875 thread->viewport[3] = command->height;
876 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and mirrors the rectangle into the global state,
// refreshing the global viewport center/scale vectors right away.
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
880 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
883 command->width = width;
884 command->height = height;
886 dpsoftrast.viewport[0] = x;
887 dpsoftrast.viewport[1] = y;
888 dpsoftrast.viewport[2] = width;
889 dpsoftrast.viewport[3] = height;
890 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a ClearColor command for one thread: fills the scissor rectangle of
// every attached color buffer with the command's color packed as BGRA8,
// restricted to the row ranges taken from the thread's miny1/maxy1 and
// miny2/maxy2 fields.
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 int i, x1, y1, x2, y2, w, h, x, y;
897 int miny1 = thread->miny1;
898 int maxy1 = thread->maxy1;
899 int miny2 = thread->miny2;
900 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
904 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905 x1 = thread->fb_scissor[0];
906 y1 = thread->fb_scissor[1];
907 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows covered by this thread's two bands
909 if (y1 < miny1) y1 = miny1;
910 if (y2 > maxy2) y2 = maxy2;
915 // FIXME: honor fb_colormask?
916 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917 for (i = 0;i < 4;i++)
// unattached color targets are skipped
919 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, the second resumes at miny2 and runs to maxy2
921 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
924 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925 for (x = x1;x < x2;x++)
// Queues a ClearColor command for the given color components.
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
932 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
939 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a ClearDepth command for one thread: fills the scissor rectangle of
// the depth buffer with the command's depth converted to the integer depth
// format, restricted to the row ranges taken from the thread's miny1/maxy1 and
// miny2/maxy2 fields.
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
942 int x1, y1, x2, y2, w, h, x, y;
943 int miny1 = thread->miny1;
944 int maxy1 = thread->maxy1;
945 int miny2 = thread->miny2;
946 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
950 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows covered by this thread's two bands
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, the second resumes at miny2 and runs to maxy2
962 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
965 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966 for (x = x1;x < x2;x++)
// Queues a ClearDepth command for depth value d.
970 void DPSOFTRAST_ClearDepth(float d)
972 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
979 thread->colormask[0] = command->r != 0;
980 thread->colormask[1] = command->g != 0;
981 thread->colormask[2] = command->b != 0;
982 thread->colormask[3] = command->a != 0;
983 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command; a non-zero argument enables writes to that channel.
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
987 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
994 DEFCOMMAND(5, DepthTest, int enable;)
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
997 thread->depthtest = command->enable;
998 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1000 void DPSOFTRAST_DepthTest(int enable)
1002 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003 command->enable = enable;
1006 DEFCOMMAND(6, ScissorTest, int enable;)
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1009 thread->scissortest = command->enable;
1010 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1012 void DPSOFTRAST_ScissorTest(int enable)
1014 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015 command->enable = enable;
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1021 thread->scissor[0] = command->x;
1022 thread->scissor[1] = command->y;
1023 thread->scissor[2] = command->width;
1024 thread->scissor[3] = command->height;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command for the given rectangle.
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1029 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1032 command->width = width;
1033 command->height = height;
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1039 thread->blendfunc[0] = command->sfactor;
1040 thread->blendfunc[1] = command->dfactor;
1041 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1045 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046 command->sfactor = sfactor;
1047 command->dfactor = dfactor;
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1053 thread->blendsubtract = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1056 void DPSOFTRAST_BlendSubtract(int enable)
1058 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059 command->enable = enable;
1062 DEFCOMMAND(10, DepthMask, int enable;)
// Applies a DepthMask command: stores the enable flag in the thread's state.
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 thread->depthmask = command->enable;
1067 void DPSOFTRAST_DepthMask(int enable)
1069 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070 command->enable = enable;
1073 DEFCOMMAND(11, DepthFunc, int func;)
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1076 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command carrying the selector.
1078 void DPSOFTRAST_DepthFunc(int func)
1080 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081 command->func = func;
// DepthRange command payload: near and far values.
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: thread->depthrange[0] receives the near value
// and thread->depthrange[1] the far value.
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1087 thread->depthrange[0] = command->nearval;
1088 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records both values.
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1092 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093 command->nearval = nearval;
1094 command->farval = farval;
// PolygonOffset command payload: the "along normal" and "into view" offsets.
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: thread->polygonoffset[0] receives alongnormal
// and thread->polygonoffset[1] receives intoview.
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1100 thread->polygonoffset[0] = command->alongnormal;
1101 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both offsets.
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1105 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106 command->alongnormal = alongnormal;
1107 command->intoview = intoview;
// CullFace command payload: the culling mode.
1110 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1113 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command carrying the mode.
1115 void DPSOFTRAST_CullFace(int mode)
1117 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118 command->mode = mode;
// AlphaTest command payload: a single enable flag.
1121 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: stores the flag in thread->alphatest, which
// DPSOFTRAST_Draw_Span_Finish consults before writing pixels.
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1124 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command carrying the enable flag.
1126 void DPSOFTRAST_AlphaTest(int enable)
1128 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129 command->enable = enable;
// AlphaFunc command payload: comparison selector and reference value.
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores the selector in thread->alphafunc and the
// reference value in thread->alphavalue.
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1135 thread->alphafunc = command->func;
1136 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and fills it from the
// arguments (the func store is shown below).
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1140 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141 command->func = func;
// Sets the constant RGBA color in dpsoftrast.color directly; no command is
// allocated here. DPSOFTRAST_Array_Load falls back to this color when no color
// array pointer is set.
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 dpsoftrast.color[0] = r;
1148 dpsoftrast.color[1] = g;
1149 dpsoftrast.color[2] = b;
1150 dpsoftrast.color[3] = a;
// Reads a rectangle of 4-byte pixels out of color buffer 0 into outpixels.
//   blockx, blocky, blockwidth, blockheight - requested rectangle
//   outpixels - destination, addressed with a row pitch of blockwidth*4 bytes
// The rectangle is clamped to the framebuffer, and source rows are addressed as
// (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped relative
// to the requested y.
// NOTE(review): output rows are indexed from the clamped by1 and no horizontal
// offset is applied to `o`, so a request clipped at the left or top edge lands
// shifted inside outpixels - confirm this is intended.
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1155 int outstride = blockwidth * 4;
1156 int instride = dpsoftrast.fb_width * 4;
1159 int bx2 = blockx + blockwidth;
1160 int by2 = blocky + blockheight;
1164 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer bounds
1168 if (bx1 < 0) bx1 = 0;
1169 if (by1 < 0) by1 = 0;
1170 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk the row pixel by pixel
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing as above
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle from color buffer 0 into one mip level of a texture.
//   index, mip    - texture handle and mip level (returns silently if the handle
//                   does not resolve or mip is outside [0, texture->mipmaps))
//   tx, ty        - destination origin in the texture
//   sx, sy        - source origin in the framebuffer
//   width, height - requested size
// Both rectangles are clamped to their surfaces and the copy size is limited to
// the smaller of the two. Source rows are walked downward starting from
// (sheight - 1 - sy1), so the image is flipped vertically during the copy.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0
1227 spixels = dpsoftrast.fb_colorpixels[0];
1228 swidth = dpsoftrast.fb_width;
1229 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, and [2] / [3] as the level's width / height
1230 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231 twidth = texture->mipmap[mip][2];
1232 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1233 if (tx1 < 0) tx1 = 0;
1234 if (ty1 < 0) ty1 = 0;
1235 if (tx2 > twidth) tx2 = twidth;
1236 if (ty2 > theight) ty2 = theight;
1237 if (sx1 < 0) sx1 = 0;
1238 if (sy1 < 0) sy1 = 0;
1239 if (sx2 > swidth) sx2 = swidth;
1240 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1245 if (tw > sw) tw = sw;
1246 if (th > sh) th = sh;
1247 if (tw < 1 || th < 1)
// convert sy1 to a row index counted from the last framebuffer row
1249 sy1 = sheight - 1 - sy1;
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// regenerate the smaller levels when the texture has more than one
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command payload: texture unit number and the texture to bind.
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state. If the unit currently has
// a texture bound, that texture's binds counter is atomically decremented before
// the new texture pointer is stored in thread->texbound[unitnum].
// unitnum is not range-checked here; DPSOFTRAST_SetTexture validates it before
// queueing the command.
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its byte stride in the global state.
// The pointer is only stored here; DPSOFTRAST_Array_Load reads through it later.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array: stores the pointer and stride and clears
// the unsigned-byte color pointer so only one color source is active.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array: stores the pointer and stride and
// clears the float color pointer so only one color source is active.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: pointer, number of float
// components per vertex, and byte stride.
// unitnum indexes the state arrays directly with no range check, so the caller
// must pass a valid unit.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command payload: shader mode and permutation.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a SetShader command: copies mode and permutation into the thread state.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Public entry point: queues a SetShader command and also mirrors the same
// mode and permutation into the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// Uniform4f command payload: uniform index and four float values.
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at slot index*4.
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command with the four values and
// mirrors them into the global dpsoftrast.uniform4f array at slot index*4.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Array-argument variant of DPSOFTRAST_Uniform4f: v must point to at least four
// floats, which are copied into a Uniform4f command and into the global
// dpsoftrast.uniform4f array at slot index*4.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command payload: uniform index and 16 floats, declared through
// the ALIGN() macro.
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at slot index*4 (four consecutive vec4 slots).
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at `uniform`.
// Each matrix gets its own UniformMatrix4f command; the uniform index advances
// by 4 and the source pointer by 16 floats per matrix. The resulting rows are
// written to the command and mirrored into dpsoftrast.uniform4f.
// NOTE(review): the unpack/move sequence below is a 4x4 transpose, presumably
// applied only when `transpose` is nonzero - confirm.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the low bits of the source pointer
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low and high halves
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations must be 16-byte aligned for _mm_store_ps
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command payload: uniform index and one integer value.
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the value in thread->uniform1i[index].
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command for `index` and mirrors the
// value i0 into the global dpsoftrast.uniform1i array.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` float4 items from a strided byte source into dst (tightly
// packed, 4 floats per item; `end` marks dst + size*4).
//   dst    - written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// Unaligned loads are used when either the source address or the stride has low
// bits set relative to ALIGN_SIZE; otherwise aligned loads are used.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` float3 items from a strided byte source into float4 items in
// dst, setting the fourth component of every output to 1.0.
//   dst    - written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed (stride == sizeof(float[3])), four items
// (12 floats) are converted per step from three 16-byte loads; end4 marks where
// that four-at-a-time pass stops (size rounded down to a multiple of 4). The
// remaining items, and every item of a non-packed source, go through the
// one-item-per-step path at the bottom.
// The recurring shuffle / move_ss / shuffle triple rotates the vector so the
// unwanted lane sits in lane 0, overwrites it with 1.0, and rotates back,
// producing (x, y, z, 1).
// NOTE(review): the one-item path loads 16 bytes for a 12-byte item, so it
// reads one float past each item; for the final vertex this may read beyond the
// caller's array - confirm the buffers tolerate that.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// packed source, unaligned loads: v1..v3 hold x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// packed source, aligned loads: same lane shuffling as the unaligned variant
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// one item per step: unaligned loads if the address or the stride is misaligned
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands `size` float2 items from a strided byte source into float4 items in
// dst; every output is (s, t, 0, 1), with the two padding lanes taken from v2.
//   dst    - written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed (stride == sizeof(float[2])), two items are
// converted per 16-byte load; end2 marks where that two-at-a-time pass stops
// (size rounded down to an even count). Other items are handled one at a time
// by loading just the low two lanes over v2.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// first item: low half of v + high half of v2; second item: high half of v + high half of v2
1533 __m128 v = _mm_loadu_ps((const float *)src);
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
// single item: replace the low two lanes of (0, 0, 0, 1) with the source pair
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` items of four unsigned bytes each into float4 items in dst,
// scaling every byte by 1/255.
//   dst    - written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed (stride == 4 bytes), four items (16 bytes)
// are converted per step; end4 marks where that pass stops (size rounded down to
// a multiple of 4). Other items are converted one at a time from a 32-bit read.
// Bytes are widened 8 -> 16 -> 32 bits by interleaving with zero before the
// integer-to-float conversion.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single item: the four bytes are fetched through an int pointer
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single float4 value read from src into dst; `end` marks
// dst + 4*size, i.e. `size` float4 items.
//   dst - written with aligned stores, so it must be 16-byte aligned
//   src - four floats, loaded once with an unaligned load
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Multiplies `numitems` float4 vectors by a 4x4 matrix.
//   out4f       - destination, numitems * 4 floats (may be the same as in4f)
//   in4f        - source vectors
//   inmatrix16f - 16 floats, read as four 4-float rows m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. A matrix that is bytewise
// equal to the identity takes a shortcut: the input is simply copied (and not
// even that when out4f == in4f).
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
// the matrix is always loaded unaligned; only the vertex input is alignment-tested
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1663 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1665 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a vertex by its w lane and applies the viewport transform.
// The input is rotated so its lanes read (1, x, y, z), with 1.0 substituted for
// the original w, then computed as viewportcenter + viewportscale * p / w, and
// rotated back. Lane 0 of viewportcenter/viewportscale therefore applies to the
// 1/w term, which ends up in the last lane of `out`, and lanes 1..3 apply to
// x, y, z.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection helper whose statements are identical to DPSOFTRAST_PROJECTVERTEX:
// divide by the w lane (with 1.0 substituted in the rotated first lane), scale
// and offset by the viewport vectors, then rotate the lanes back.
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Matrix-times-vector helper: out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, where
// each lane of p is broadcast with a shuffle and m0..m3 are the matrix rows as
// loaded by the callers.
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Estimates the screen-space row range covered by a batch of vertices from
// their bounding box.
//   starty, endy      - outputs: first row and one-past-last row
//   minpos, maxpos    - component-wise min/max of the untransformed positions
//   viewportcenter,
//   viewportscale     - viewport vectors in the lane order used by
//                       DPSOFTRAST_PROJECTVERTEX
//   m0..m3            - matrix rows applied to the box corners
// clipmask starts at 0xFF, one bit per box corner. A corner's bit is cleared
// when its z + w is >= 0, and its projected value then widens
// [minproj, maxproj]. The if-chain on clipmask further down handles corners
// that are still flagged: for each of the three box edges leaving such a corner
// (neighbours k^1, k^2, k^4) whose other end is not flagged, the point where
// z + w crosses zero is interpolated and projected as well.
// NOTE(review): clipmask is presumably the return value (callers store the
// result in dpsoftrast.drawclipped) - confirm.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix row so the transformed y lands in lane 0,
// where the scalar (_ss) min/max/divide operations below act
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, record its z + w distance, and if it is
// not behind that plane, clear its clip bit and fold lane0 / w into the range
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the y term in PROJECTVERTEX lane order)
// into lane 0, clamp the projected range to [-2, 2], and map it to rows.
// starty is taken from maxproj and endy from minproj.
// NOTE(review): that pairing only yields starty <= endy if the y viewport scale
// is negative - confirm.
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` float4 positions to screen space without a matrix
// multiply.
//   out4f       - receives the positions unchanged (aligned stores)
//   screen4f    - receives the projected positions (aligned stores)
//   starty/endy - row range outputs, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f        - source positions; aligned or unaligned loads are chosen by
//                 testing the pointer against ALIGN_SIZE
// While iterating, the component-wise min and max of the inputs are tracked and
// then passed to DPSOFTRAST_Vertex_BoundY together with identity matrix rows.
// Returns whatever DPSOFTRAST_Vertex_BoundY returns.
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1767 float *end = out4f + numitems*4;
1768 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769 __m128 minpos, maxpos;
1770 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// the bounds are seeded from the first vertex
1772 minpos = maxpos = _mm_loadu_ps(in4f);
1775 __m128 v = _mm_loadu_ps(in4f);
1776 minpos = _mm_min_ps(minpos, v);
1777 maxpos = _mm_max_ps(maxpos, v);
1778 _mm_store_ps(out4f, v);
1779 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780 _mm_store_ps(screen4f, v);
1788 minpos = maxpos = _mm_load_ps(in4f);
1791 __m128 v = _mm_load_ps(in4f);
1792 minpos = _mm_min_ps(minpos, v);
1793 maxpos = _mm_max_ps(maxpos, v);
1794 _mm_store_ps(out4f, v);
1795 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796 _mm_store_ps(screen4f, v);
// identity rows: the bounding box corners are used as they are
1803 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1804 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` float4 positions by a 4x4 matrix and projects them to
// screen space.
//   out4f       - receives the transformed positions (aligned stores)
//   screen4f    - receives the projected positions (aligned stores)
//   starty/endy - row range outputs, filled in by DPSOFTRAST_Vertex_BoundY
//   in4f        - source positions; aligned or unaligned loads are chosen by
//                 testing the pointer against ALIGN_SIZE
//   inmatrix16f - 16 floats read as four rows; a matrix bytewise equal to the
//                 identity is delegated to DPSOFTRAST_Vertex_Project
// The min/max bounds are accumulated from the untransformed inputs; the matrix
// rows are handed to DPSOFTRAST_Vertex_BoundY, which transforms the box corners
// itself. Returns whatever the delegate / DPSOFTRAST_Vertex_BoundY returns.
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1813 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1816 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818 end = out4f + numitems*4;
1819 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821 m0 = _mm_loadu_ps(inmatrix16f);
1822 m1 = _mm_loadu_ps(inmatrix16f + 4);
1823 m2 = _mm_loadu_ps(inmatrix16f + 8);
1824 m3 = _mm_loadu_ps(inmatrix16f + 12);
1825 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1827 minpos = maxpos = _mm_loadu_ps(in4f);
1830 __m128 v = _mm_loadu_ps(in4f);
1831 minpos = _mm_min_ps(minpos, v);
1832 maxpos = _mm_max_ps(maxpos, v);
1833 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834 _mm_store_ps(out4f, v);
1835 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836 _mm_store_ps(screen4f, v);
1844 minpos = maxpos = _mm_load_ps(in4f);
1847 __m128 v = _mm_load_ps(in4f);
1848 minpos = _mm_min_ps(minpos, v);
1849 maxpos = _mm_max_ps(maxpos, v);
1850 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851 _mm_store_ps(out4f, v);
1852 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853 _mm_store_ps(screen4f, v);
1860 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts one client vertex array into float4 form in
// dpsoftrast.post_array4f[outarray], for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   outarray - index of the destination post_array4f buffer
//   inarray  - which client array to read (DPSOFTRAST_ARRAY_POSITION,
//              DPSOFTRAST_ARRAY_COLOR, or a texcoord array addressed relative
//              to DPSOFTRAST_ARRAY_TEXCOORD0)
// The source address is always pointer + firstvertex * stride.
// NOTE(review): presumably returns outf - confirm.
1865 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1868 float *outf = dpsoftrast.post_array4f[outarray];
1869 const unsigned char *inb;
1870 int firstvertex = dpsoftrast.firstvertex;
1871 int numvertices = dpsoftrast.numvertices;
// positions are float3 and get expanded to float4
1875 case DPSOFTRAST_ARRAY_POSITION:
1876 stride = dpsoftrast.stride_vertex;
1877 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1878 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: a float4 array, else an unsigned-byte array, else the constant
// dpsoftrast.color replicated for every vertex
1880 case DPSOFTRAST_ARRAY_COLOR:
1881 stride = dpsoftrast.stride_color;
1882 if (dpsoftrast.pointer_color4f)
1884 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1885 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1887 else if (dpsoftrast.pointer_color4ub)
1889 stride = dpsoftrast.stride_color;
1890 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1891 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1895 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the loader is chosen by the unit's component count
1899 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1900 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1902 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1903 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1906 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1909 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post-array in place by a 4x4 matrix. When inarray >= 0 the data
// is first loaded from that client array into post_array4f[outarray]; a
// negative inarray reuses whatever post_array4f[outarray] already holds.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post-array to screen space. When inarray >= 0 the data is first
// loaded from that client array; a negative inarray reuses the existing
// post_array4f[outarray]. Screen coordinates go to dpsoftrast.screencoord4f,
// and the clip result and row range are stored in dpsoftrast.drawclipped,
// drawstarty and drawendy.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1935 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post-array in place by a 4x4 matrix and projects it to screen
// space. When inarray >= 0 the data is first loaded from that client array; a
// negative inarray reuses the existing post_array4f[outarray]. Screen
// coordinates go to dpsoftrast.screencoord4f, and the clip result and row range
// are stored in dpsoftrast.drawclipped, drawstarty and drawendy.
1944 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1947 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1948 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for a span.
// w is a linear function of screen position: triangle->w[2] plus span->x times
// the per-pixel slope w[0] plus span->y times w[1]. The reciprocal is evaluated
// exactly only at sub-span boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels)
// and stepped linearly by dz in between.
// NOTE(review): the per-pixel values are presumably written to zf - confirm.
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1958 int startx = span->startx;
1959 int endx = span->endx;
1960 float wslope = triangle->w[0];
1961 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962 float endz = 1.0f / (w + wslope * startx);
1963 for (x = startx;x < endx;)
1965 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final sub-span is clamped to the last pixel of the span
1967 if (nextsub >= endx) nextsub = endsub = endx-1;
1968 endz = 1.0f / (w + wslope * nextsub);
// a one-pixel sub-span has no slope; this also avoids dividing by zero
1969 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970 for (; x <= endsub; x++, z += dz)
// Blends a span of float colors into color buffer 0 according to
// thread->fb_blendmode.
//   in4f - four floats per pixel, indexed by x in [span->startx, span->endx)
// The destination is addressed as 4 bytes per pixel starting at
// (span->y * fb_width + span->x). In every case except INVADD, input component
// 2 pairs with destination byte 0 and input component 0 with destination
// byte 2. Results are clamped to 255; only SUBALPHA also clamps at 0.
// The alpha test runs first and clears pixelmask entries.
// NOTE(review): the alpha test compares against a fixed 0.5 instead of
// thread->alphafunc / thread->alphavalue - confirm this is intended.
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1978 int startx = span->startx;
1979 int endx = span->endx;
1982 unsigned char * RESTRICT pixelmask = span->pixelmask;
1983 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1986 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987 // handle alphatest now (this affects depth writes too)
1988 if (thread->alphatest)
1989 for (x = startx;x < endx;x++)
1990 if (in4f[x*4+3] < 0.5f)
1991 pixelmask[x] = false;
1992 // FIXME: this does not handle bigendian
1993 switch(thread->fb_blendmode)
// OPAQUE: the destination is overwritten with the source color
1995 case DPSOFTRAST_BLENDMODE_OPAQUE:
1996 for (x = startx;x < endx;x++)
2000 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004 pixel[x*4+0] = d[0];
2005 pixel[x*4+1] = d[1];
2006 pixel[x*4+2] = d[2];
2007 pixel[x*4+3] = d[3];
// ALPHA: src * alpha + dst * (1 - alpha)
2010 case DPSOFTRAST_BLENDMODE_ALPHA:
2011 for (x = startx;x < endx;x++)
2015 a = in4f[x*4+3] * 255.0f;
2016 b = 1.0f - in4f[x*4+3];
2017 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021 pixel[x*4+0] = d[0];
2022 pixel[x*4+1] = d[1];
2023 pixel[x*4+2] = d[2];
2024 pixel[x*4+3] = d[3];
// ADDALPHA: src * alpha + dst
2027 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028 for (x = startx;x < endx;x++)
2032 a = in4f[x*4+3] * 255.0f;
2033 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037 pixel[x*4+0] = d[0];
2038 pixel[x*4+1] = d[1];
2039 pixel[x*4+2] = d[2];
2040 pixel[x*4+3] = d[3];
// ADD: src + dst
2043 case DPSOFTRAST_BLENDMODE_ADD:
2044 for (x = startx;x < endx;x++)
2048 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052 pixel[x*4+0] = d[0];
2053 pixel[x*4+1] = d[1];
2054 pixel[x*4+2] = d[2];
2055 pixel[x*4+3] = d[3];
// INVMOD: dst * (1 - src)
2058 case DPSOFTRAST_BLENDMODE_INVMOD:
2059 for (x = startx;x < endx;x++)
2063 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067 pixel[x*4+0] = d[0];
2068 pixel[x*4+1] = d[1];
2069 pixel[x*4+2] = d[2];
2070 pixel[x*4+3] = d[3];
// MUL: src * dst
2073 case DPSOFTRAST_BLENDMODE_MUL:
2074 for (x = startx;x < endx;x++)
2078 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082 pixel[x*4+0] = d[0];
2083 pixel[x*4+1] = d[1];
2084 pixel[x*4+2] = d[2];
2085 pixel[x*4+3] = d[3];
// MUL2: 2 * src * dst
2088 case DPSOFTRAST_BLENDMODE_MUL2:
2089 for (x = startx;x < endx;x++)
2093 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097 pixel[x*4+0] = d[0];
2098 pixel[x*4+1] = d[1];
2099 pixel[x*4+2] = d[2];
2100 pixel[x*4+3] = d[3];
// SUBALPHA: dst - src * alpha (a is negative), clamped at both ends
2103 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104 for (x = startx;x < endx;x++)
2108 a = in4f[x*4+3] * -255.0f;
2109 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113 pixel[x*4+0] = d[0];
2114 pixel[x*4+1] = d[1];
2115 pixel[x*4+2] = d[2];
2116 pixel[x*4+3] = d[3];
// PSEUDOALPHA: src * a + dst * (1 - alpha)
2119 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120 for (x = startx;x < endx;x++)
2125 b = 1.0f - in4f[x*4+3];
2126 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130 pixel[x*4+0] = d[0];
2131 pixel[x*4+1] = d[1];
2132 pixel[x*4+2] = d[2];
2133 pixel[x*4+3] = d[3];
// INVADD: (255 - dst) * src + dst
// NOTE(review): unlike every other case, d[0] is computed from in4f component 0
// and destination byte 2 but stored to byte 0 (and the reverse for d[2]), so
// the first and third destination channels appear to be exchanged on each
// write - confirm whether this is intended.
2136 case DPSOFTRAST_BLENDMODE_INVADD:
2137 for (x = startx;x < endx;x++)
2141 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2142 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2144 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145 pixel[x*4+0] = d[0];
2146 pixel[x*4+1] = d[1];
2147 pixel[x*4+2] = d[2];
2148 pixel[x*4+3] = d[3];
// DPSOFTRAST_Draw_Span_FinishBGRA8
// Writes one span of shaded BGRA8 pixels (in4ub, 4 bytes per pixel) into
// dpsoftrast.fb_colorpixels[0], combining them with the pixels already there
// according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the blend mode
//   triangle - not referenced by any line visible in this listing
//   span     - supplies x, y, startx, endx and the per-pixel pixelmask
//   in4ub    - source pixels; byte 3 of each pixel is used as alpha
// span->pixelmask gates which pixels are touched (see the mask tests below).
// NOTE(review): this listing omits short lines (braces, break/continue, loop
// increments), so only the surviving statements are annotated.
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2158 int startx = span->startx;
2159 int endx = span->endx;
// 32-bit view of the source pixels, one unsigned int per BGRA8 pixel
2160 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same color buffer
2162 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views to the first pixel of this span's row/column origin
2166 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167 pixeli += span->y * dpsoftrast.fb_width + span->x;
2168 // handle alphatest now (this affects depth writes too)
// alpha is an 8-bit value here, so the cutoff is 128 (half of full scale);
// comparing the byte against 0.5f only ever rejected alpha == 0
2169 if (thread->alphatest)
2170 for (x = startx;x < endx;x++)
2171 if (in4ub[x*4+3] < 128)
2172 pixelmask[x] = false;
2173 // FIXME: this does not handle bigendian
2174 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, copy four pixels with a
// single unaligned 128-bit load/store
2176 case DPSOFTRAST_BLENDMODE_OPAQUE:
2177 for (x = startx;x + 4 <= endx;)
2179 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1), defined inside the ALPHA case and reused by the
// cases below: walks the span two pixels at a time, switching on a 16-bit read
// of the two mask bytes. Pixels are widened from 8 to 16 bits per channel into
// src/dst, the supplied blend code updates dst, and the result is narrowed with
// unsigned saturation (_mm_packus_epi16). blend2 operates on a pair of pixels
// (64-bit load/store), blend1 on a single pixel; a trailing loop handles the
// last odd pixel.
// ALPHA: dst += (src - dst) * alpha / 256 (both factors are pre-shifted left by
// 4 so that the high 16 bits of the product are the /256 result)
2195 case DPSOFTRAST_BLENDMODE_ALPHA:
2196 #define FINISHBLEND(blend2, blend1) \
2197 for (x = startx;x + 1 < endx;x += 2) \
2200 switch (*(const unsigned short*)&pixelmask[x]) \
2203 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2206 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2209 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2212 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2218 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2223 for(;x < endx; x++) \
2226 if (!pixelmask[x]) \
2228 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2231 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8, alpha broadcast from each pixel's channel 3
2242 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2244 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst += src (clamped to 255 by the saturating pack in FINISHBLEND)
2251 case DPSOFTRAST_BLENDMODE_ADD:
2252 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2254 case DPSOFTRAST_BLENDMODE_INVMOD:
2256 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2261 case DPSOFTRAST_BLENDMODE_MUL:
2262 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2264 case DPSOFTRAST_BLENDMODE_MUL2:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2267 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2269 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2276 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2278 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256 (same pre-shift trick as ALPHA)
2285 case DPSOFTRAST_BLENDMODE_INVADD:
2287 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// DPSOFTRAST_Draw_Span_Texture2DVarying
// Samples the 2D texture bound to texunitindex along one span and writes the
// result as 4 floats per pixel into out4f. Texture bytes are scaled by 1/255
// and stored with channels 0 and 2 swapped (byte 2 -> out 0, byte 0 -> out 2).
//   thread       - supplies the bound texture (thread->texbound)
//   triangle     - supplies the mip level and, via DPSOFTRAST_CALCATTRIB4F, the
//                  interpolants of vertex attribute 'arrayindex'
//   span         - supplies startx/endx
//   zf           - per-pixel factor multiplied into the interpolated coordinate
// Texture coordinates are evaluated exactly only at sub-span endpoints (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped linearly in 16.16 fixed
// point in between.
// NOTE(review): this listing omits short lines (braces, returns, some
// declarations and assignments), so only the surviving statements are annotated.
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2299 int startx = span->startx;
2300 int endx = span->endx;
2305 float tc[2], endtc[2];
2307 unsigned int tci[2];
2308 unsigned int tci1[2];
2309 unsigned int tcimin[2];
2310 unsigned int tcimax[2];
2315 const unsigned char * RESTRICT pixelbase;
2316 const unsigned char * RESTRICT pixel[4];
2317 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318 // if no texture is bound, just fill it with white
2321 for (x = startx;x < endx;x++)
2323 out4f[x*4+0] = 1.0f;
2324 out4f[x*4+1] = 1.0f;
2325 out4f[x*4+2] = 1.0f;
2326 out4f[x*4+3] = 1.0f;
2330 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is the byte offset of this mip level inside texture->bytes
2331 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332 // if this mipmap of the texture is 1 pixel, just fill it with that color
// read the single texel from this mip level (pixelbase), not from the start of
// the texture data, so the selected mip's color is used
2333 if (texture->mipmap[mip][1] == 4)
2335 c[0] = pixelbase[2] * (1.0f/255.0f);
2336 c[1] = pixelbase[1] * (1.0f/255.0f);
2337 c[2] = pixelbase[0] * (1.0f/255.0f);
2338 c[3] = pixelbase[3] * (1.0f/255.0f);
2339 for (x = startx;x < endx;x++)
2341 out4f[x*4+0] = c[0];
2342 out4f[x*4+1] = c[1];
2343 out4f[x*4+2] = c[2];
2344 out4f[x*4+3] = c[3];
2348 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350 flags = texture->flags;
// mipmap[mip][2] and [3] scale the two coordinates into texel units; [2] is
// also the row width used when forming texel addresses
2351 tcscale[0] = texture->mipmap[mip][2];
2352 tcscale[1] = texture->mipmap[mip][3];
2353 tciwidth = texture->mipmap[mip][2];
2356 tcimax[0] = texture->mipmap[mip][2]-1;
2357 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 masks used by the non-clamping loops
2358 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the first pixel; the -0.5 shifts to texel centers
2360 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2362 for (x = startx;x < endx;)
// NOTE(review): substep/subtc are unsigned but are assigned from float
// expressions that can be negative; confirm the intended wraparound behavior
2364 unsigned int subtc[2];
2365 unsigned int substep[2];
2366 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end exactly on the final pixel and rescale the step
2368 if (nextsub >= endx)
2370 nextsub = endsub = endx-1;
2371 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2375 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// 16.16 fixed-point start and per-pixel step for this sub-span
2377 substep[0] = (endtc[0] - tc[0]) * subscale;
2378 substep[1] = (endtc[1] - tc[1]) * subscale;
2379 subtc[0] = tc[0] * (1<<16);
2380 subtc[1] = tc[1] * (1<<16);
// bilinear, coordinates clamped to [tcimin, tcimax]
// NOTE(review): the condition selecting the two bilinear loops is not among the
// lines shown; presumably it tests 'filter' - confirm against the full source
2383 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2385 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit blend weights taken from the top 12 bits of the 16-bit fraction (the
// integer texel is subtc>>16); the four weights sum to 0x1000*0x1000, which the
// 1/0xFF000000 factor below normalizes together with the 0..255 byte range
2387 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2388 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390 tci[0] = subtc[0]>>16;
2391 tci[1] = subtc[1]>>16;
2392 tci1[0] = tci[0] + 1;
2393 tci1[1] = tci[1] + 1;
2394 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the 2x2 texel footprint: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2398 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2402 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406 out4f[x*4+0] = c[0];
2407 out4f[x*4+1] = c[1];
2408 out4f[x*4+2] = c[2];
2409 out4f[x*4+3] = c[3];
// bilinear, coordinates wrapped with the size-1 masks
2414 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// same top-12-bits-of-fraction weights as in the clamped loop above
2416 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2417 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419 tci[0] = subtc[0]>>16;
2420 tci[1] = subtc[1]>>16;
2421 tci1[0] = tci[0] + 1;
2422 tci1[1] = tci[1] + 1;
2423 tci[0] &= tciwrapmask[0];
2424 tci[1] &= tciwrapmask[1];
2425 tci1[0] &= tciwrapmask[0];
2426 tci1[1] &= tciwrapmask[1];
2427 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435 out4f[x*4+0] = c[0];
2436 out4f[x*4+1] = c[1];
2437 out4f[x*4+2] = c[2];
2438 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates clamped
2442 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2444 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2446 tci[0] = subtc[0]>>16;
2447 tci[1] = subtc[1]>>16;
2448 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455 out4f[x*4+0] = c[0];
2456 out4f[x*4+1] = c[1];
2457 out4f[x*4+2] = c[2];
2458 out4f[x*4+3] = c[3];
// single-texel lookup, coordinates wrapped with the size-1 masks
2463 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2465 tci[0] = subtc[0]>>16;
2466 tci[1] = subtc[1]>>16;
2467 tci[0] &= tciwrapmask[0];
2468 tci[1] &= tciwrapmask[1];
2469 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474 out4f[x*4+0] = c[0];
2475 out4f[x*4+1] = c[1];
2476 out4f[x*4+2] = c[2];
2477 out4f[x*4+3] = c[3];
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2487 int startx = span->startx;
2488 int endx = span->endx;
2490 __m128 data, slope, tcscale;
2491 __m128i tcsize, tcmask, tcoffset, tcmax;
2493 __m128i subtc, substep, endsubtc;
2496 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497 const unsigned char * RESTRICT pixelbase;
2498 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499 // if no texture is bound, just fill it with white
2502 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2505 mip = triangle->mip[texunitindex];
2506 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507 // if this mipmap of the texture is 1 pixel, just fill it with that color
2508 if (texture->mipmap[mip][1] == 4)
2510 unsigned int k = *((const unsigned int *)pixelbase);
2511 for (x = startx;x < endx;x++)
2515 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517 flags = texture->flags;
2518 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520 tcscale = _mm_cvtepi32_ps(tcsize);
2521 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2523 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2525 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2526 tcmax = _mm_packs_epi32(tcmask, tcmask);
2527 for (x = startx;x < endx;)
2529 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531 if (nextsub >= endx)
2533 nextsub = endsub = endx-1;
2534 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2538 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2541 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542 substep = _mm_slli_epi32(substep, 1);
2545 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2548 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2551 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2552 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553 tci = _mm_madd_epi16(tci, tcoffset);
2554 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2556 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2560 fracm = _mm_srli_epi16(subtc, 1);
2561 pix1 = _mm_add_epi16(pix1,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564 pix3 = _mm_add_epi16(pix3,
2565 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569 pix2 = _mm_add_epi16(pix2,
2570 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2576 const unsigned char * RESTRICT ptr1;
2577 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578 tci = _mm_madd_epi16(tci, tcoffset);
2579 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582 fracm = _mm_srli_epi16(subtc, 1);
2583 pix1 = _mm_add_epi16(pix1,
2584 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587 pix1 = _mm_add_epi16(pix1,
2588 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2594 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2596 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2598 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600 tci = _mm_madd_epi16(tci, tcoffset);
2601 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603 _mm_setzero_si128());
2604 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606 _mm_setzero_si128());
2607 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609 tci = _mm_madd_epi16(tci, tcoffset);
2610 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612 _mm_setzero_si128());
2613 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615 _mm_setzero_si128());
2616 fracm = _mm_srli_epi16(subtc, 1);
2617 pix1 = _mm_add_epi16(pix1,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620 pix3 = _mm_add_epi16(pix3,
2621 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625 pix2 = _mm_add_epi16(pix2,
2626 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2632 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634 tci = _mm_madd_epi16(tci, tcoffset);
2635 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637 _mm_setzero_si128());
2638 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640 _mm_setzero_si128());
2641 fracm = _mm_srli_epi16(subtc, 1);
2642 pix1 = _mm_add_epi16(pix1,
2643 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646 pix1 = _mm_add_epi16(pix1,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2655 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2657 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668 tci = _mm_madd_epi16(tci, tcoffset);
2669 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671 _mm_setzero_si128());
2672 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674 _mm_setzero_si128());
2675 fracm = _mm_srli_epi16(subtc, 1);
2676 pix1 = _mm_add_epi16(pix1,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679 pix3 = _mm_add_epi16(pix3,
2680 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684 pix2 = _mm_add_epi16(pix2,
2685 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2691 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693 tci = _mm_madd_epi16(tci, tcoffset);
2694 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696 _mm_setzero_si128());
2697 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699 _mm_setzero_si128());
2700 fracm = _mm_srli_epi16(subtc, 1);
2701 pix1 = _mm_add_epi16(pix1,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705 pix1 = _mm_add_epi16(pix1,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2715 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2717 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2719 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2727 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729 tci = _mm_madd_epi16(tci, tcoffset);
2730 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2736 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2746 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747 tci = _mm_and_si128(tci, tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2761 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// DPSOFTRAST_SampleShadowmap
// Takes a pointer to float components ('vector') and returns a float.
// NOTE(review): only the signature survives in this listing; the body is not
// shown, so what is sampled and what range is returned must be confirmed
// against the full source.
2764 float DPSOFTRAST_SampleShadowmap(const float *vector)
// DPSOFTRAST_Draw_Span_MultiplyVarying
// Modulates a float span by an interpolated vertex attribute: for each pixel x
// in [startx, endx) and each of the 4 channels,
//   out4f = in4f * ((data + slope*x) * z)
// where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for 'arrayindex'.
// NOTE(review): the local declarations and the assignment of z are not among
// the lines shown here; presumably z = zf[x] (zf is otherwise unused) - confirm
// against the full source.
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2773 int startx = span->startx;
2774 int endx = span->endx;
// fills data (value at x = 0) and slope (change per pixel) for the attribute
2779 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780 for (x = startx;x < endx;x++)
// attribute value at this pixel, scaled by z
2783 c[0] = (data[0] + slope[0]*x) * z;
2784 c[1] = (data[1] + slope[1]*x) * z;
2785 c[2] = (data[2] + slope[2]*x) * z;
2786 c[3] = (data[3] + slope[3]*x) * z;
// channel-wise product with the input span
2787 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790 out4f[x*4+3] = in4f[x*4+3] * c[3];
// DPSOFTRAST_Draw_Span_Varying
// Writes an interpolated vertex attribute into a float span: for each pixel x
// in [startx, endx) and each of the 4 channels,
//   out4f = (data + slope*x) * z
// where data/slope are produced by DPSOFTRAST_CALCATTRIB4F for 'arrayindex'.
// NOTE(review): the local declarations and the assignment of z are not among
// the lines shown here; presumably z = zf[x] (zf is otherwise unused) - confirm
// against the full source.
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2797 int startx = span->startx;
2798 int endx = span->endx;
// fills data (value at x = 0) and slope (change per pixel) for the attribute
2803 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804 for (x = startx;x < endx;x++)
// attribute value at this pixel, scaled by z
2807 c[0] = (data[0] + slope[0]*x) * z;
2808 c[1] = (data[1] + slope[1]*x) * z;
2809 c[2] = (data[2] + slope[2]*x) * z;
2810 c[3] = (data[3] + slope[3]*x) * z;
2811 out4f[x*4+0] = c[0];
2812 out4f[x*4+1] = c[1];
2813 out4f[x*4+2] = c[2];
2814 out4f[x*4+3] = c[3];
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2820 int x, startx = span->startx, endx = span->endx;
2821 float c[4], localcolor[4];
2822 localcolor[0] = subcolor[0];
2823 localcolor[1] = subcolor[1];
2824 localcolor[2] = subcolor[2];
2825 localcolor[3] = subcolor[3];
2826 for (x = startx;x < endx;x++)
2828 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2841 int x, startx = span->startx, endx = span->endx;
2842 for (x = startx;x < endx;x++)
2844 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2853 int x, startx = span->startx, endx = span->endx;
2854 for (x = startx;x < endx;x++)
2856 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float buffers per pixel: out4f = ina4f * a + inb4f * b for all four channels,
// where a is one minus the fourth channel of inb4f at that pixel.
// NOTE(review): the declaration of a and b and the assignment of b are not visible in this
// excerpt; b is presumably inb4f[x*4+3] (so that a + b == 1) - confirm.
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 int x, startx = span->startx, endx = span->endx;
2867 for (x = startx;x < endx;x++)
// weight applied to ina4f: complement of inb4f's fourth channel
2869 a = 1.0f - inb4f[x*4+3];
2871 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2880 int x, startx = span->startx, endx = span->endx;
2881 float localcolor[4], ilerp, lerp;
2882 localcolor[0] = color[0];
2883 localcolor[1] = color[1];
2884 localcolor[2] = color[2];
2885 localcolor[3] = color[3];
2886 ilerp = 1.0f - localcolor[3];
2887 lerp = localcolor[3];
2888 for (x = startx;x < endx;x++)
2890 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies an 8-bit-per-channel input span (in4ub) by interpolated vertex attribute
// 'arrayindex' and writes the result to out4ub (4 bytes per pixel).
// The attribute is evaluated exactly ((data + slope*x) * zf[x]) only at sub-span end points and
// stepped linearly in fixed point (scale 256) in between; sub-spans are at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels long.
// NOTE(review): some short lines are not visible in this excerpt (braces, declarations of
// x/data/slope/mod/endmod, the statements that carry endmod/endsubmod over into mod/submod, and
// the guard around the trailing single-pixel store) - confirm against the full file.
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2903 int startx = span->startx;
2904 int endx = span->endx;
2907 __m128i submod, substep, endsubmod;
2908 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lane 3 stays) so the channel order matches the byte buffers
2909 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x256)
2911 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2913 for (x = startx; x < endx;)
2915 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default step scale for a full-length sub-span
2916 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
2917 if (nextsub >= endx)
2919 nextsub = endsub = endx-1;
2920 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel fixed-point step toward it
2924 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16 bits: low half holds the modulator for pixel x, high half for pixel x+1
2927 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration; the 16-bit add advances both halves by one step per iteration
2929 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes are placed in the high byte of each 16-bit lane, so mulhi yields (byte * submod) >> 8
2931 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932 pix = _mm_mulhi_epu16(pix, submod);
2933 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard is not visible here)
2937 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938 pix = _mm_mulhi_epu16(pix, submod);
2939 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes interpolated vertex attribute 'arrayindex' directly as 8-bit-per-channel pixels into
// out4ub (4 bytes per pixel).
// The attribute is evaluated exactly ((data + slope*x) * zf[x]) only at sub-span end points and
// stepped linearly in fixed point (scale 4095) in between; the 16-bit values are shifted right
// by 4 and packed with unsigned saturation to produce bytes.
// NOTE(review): some short lines are not visible in this excerpt (braces, declarations of
// x/data/slope/mod/endmod, the statements that carry endmod/endsubmod over into mod/submod, and
// the guard around the trailing single-pixel store) - confirm against the full file.
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2950 int startx = span->startx;
2951 int endx = span->endx;
2954 __m128i submod, substep, endsubmod;
2955 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lane 3 stays) so the channel order matches the byte buffer
2956 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x4095)
2958 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2960 for (x = startx; x < endx;)
2962 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default step scale for a full-length sub-span
2963 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
2964 if (nextsub >= endx)
2966 nextsub = endsub = endx-1;
2967 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span and the per-pixel fixed-point step toward it
2971 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2972 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16 bits: low half holds the value for pixel x, high half for pixel x+1
2974 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2975 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration; the 16-bit add advances both halves by one step per iteration
2976 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits, then saturate to bytes
2978 __m128i pix = _mm_srai_epi16(submod, 4);
2979 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard is not visible here)
2983 __m128i pix = _mm_srai_epi16(submod, 4);
2984 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine: out4ub = ina4ub + (inb4ub - subcolor), with the subtraction saturating
// at zero and the final result saturating to a byte. Buffers hold 4 bytes per pixel.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2994 int x, startx = span->startx, endx = span->endx;
// subcolor scaled by 255 and converted to integers, with lanes 0 and 2 swapped to match the byte order
2995 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16 bits and duplicate so both pixels of a pair see the same color
2996 localcolor = _mm_packs_epi32(localcolor, localcolor);
// main loop: two pixels per iteration
2997 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes (zero in the high byte)
2999 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3001 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3002 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3006 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3009 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit component-wise product: out4ub = (ina4ub * inb4ub) >> 8 per channel.
// Buffers hold 4 bytes per pixel.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3017 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3018 for (x = startx;x+2 <= endx;x+=2)
// pix1 keeps the byte in the low half of each lane, pix2 puts it in the high half (byte << 8),
// so the high 16 bits of their product equal (a * b) >> 8
3020 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3027 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit component-wise sum: out4ub = ina4ub + inb4ub per channel, saturated to 255 when the
// 16-bit sums are packed back to bytes. Buffers hold 4 bytes per pixel.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3038 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3039 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes so the add cannot wrap
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043 pix1 = _mm_add_epi16(pix1, pix2);
3044 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3048 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050 pix1 = _mm_add_epi16(pix1, pix2);
3051 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit tinted add: out4ub = ina4ub + inb4ub * tint per channel, saturated to a byte.
// The tint is read from inbtintbgra as four floats and used in fixed point (scale 256); unlike
// the other span helpers no lane swap is applied to it.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3059 int x, startx = span->startx, endx = span->endx;
3060 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// narrow to 16 bits and duplicate so both pixels of a pair see the same tint
3061 tint = _mm_packs_epi32(tint, tint);
// main loop: two pixels per iteration
3062 for (x = startx;x+2 <= endx;x+=2)
// pix2 holds inb bytes in the high half of each lane, so mulhi(tint, pix2) is (tint * byte) >> 8
3064 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3066 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3071 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of two buffers using the fourth channel of inb4ub as the blend factor:
// out4ub = ina4ub + ((inb4ub - ina4ub) * blend) >> 8 per channel. Buffers hold 4 bytes per pixel.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3082 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3083 for (x = startx;x+2 <= endx;x+=2)
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel of pix2 to all four lanes of that pixel
3087 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply yields (diff * blend) >> 8
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3095 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend toward a constant color, using the color's fourth component as the blend factor:
// out4ub = in4ub + ((color - in4ub) * blend) >> 8 per channel. Buffers hold 4 bytes per pixel.
// NOTE(review): braces and the guard around the trailing single-pixel block are not visible in
// this excerpt - confirm against the full file.
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3105 int x, startx = span->startx, endx = span->endx;
// color scaled by 255 and converted to integers, with lanes 0 and 2 swapped to match the byte order
3106 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16 bits and duplicate so both pixels of a pair see the same color
3107 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast lane 3 of the color to every lane, pre-shifted left by 4 for the high-half multiply below
3108 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// main loop: two pixels per iteration
3109 for (x = startx;x+2 <= endx;x+=2)
3111 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (blend << 4) >> 16 equals (diff * blend) >> 8
3112 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3117 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs DPSOFTRAST_Array_TransformProject on the position
// array with the ModelViewProjection matrix uniform, then loads the color and first texcoord
// arrays; the second texcoord array is loaded only for the SPECULAR permutation.
3126 void DPSOFTRAST_VertexShader_Generic(void)
3128 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the pixel stage samples a second texture only when this permutation bit is set
3131 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3137 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3144 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3148 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3152 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3154 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3157 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3162 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3167 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs DPSOFTRAST_Array_TransformProject on the
// position array with the ModelViewProjection matrix uniform and loads two texcoord arrays.
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// NOTE(review): the two array arguments differ here (TEXCOORD1 vs TEXCOORD4), unlike every other
// DPSOFTRAST_Array_Load call in this section - confirm which is source and which is destination.
3177 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3182 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3183 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3187 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3190 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3193 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3196 // TODO: implement saturation
3198 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3200 // TODO: implement gammaramps
3202 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed, via
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix uniform.
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3209 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3214 // this is never called (because colormask is off when this shader is used)
3215 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3218 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: runs DPSOFTRAST_Array_TransformProject on the position
// array with the ModelViewProjection matrix uniform, and DPSOFTRAST_Array_Transform on the first
// texcoord array with the TexMatrix uniform.
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3226 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader: samples the color texture and multiplies it by a
// constant built from the Color_Ambient uniform (first three lanes) and the Alpha uniform
// (fourth lane).
// Pixels are written straight into the framebuffer row unless alpha test or a non-opaque blend
// mode is active; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some short lines are not visible in this excerpt (braces, declarations of
// color/pix/pix2, and the statements between the 4-pixel and 1-pixel paths, which presumably
// advance x past the 4-pixel group and skip masked pixels) - confirm against the full file.
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination defaults to the framebuffer, offset so that pixel[x*4] addresses span pixel x
3234 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235 int x, startx = span->startx, endx = span->endx;
3236 __m128i Color_Ambientm;
3237 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// direct framebuffer writes are only used for opaque, non-alpha-tested output
3242 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243 pixel = buffer_FragColorbgra8;
// ambient color in fixed point (x256), lanes 0 and 2 swapped to match the byte order
3244 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the fourth lane with the Alpha uniform (note: scaled by 255, not 256)
3245 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3247 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248 for (x = startx;x < endx;x++)
// fast path: four consecutive pixels whose mask bytes are all 1
3251 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3254 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texels sit in the high byte of each 16-bit lane, so mulhi yields (constant * texel) >> 8
3255 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3263 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to go through the common finish step
3267 if (pixel == buffer_FragColorbgra8)
3268 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: runs DPSOFTRAST_Array_TransformProject on the
// position array with the ModelViewProjection matrix uniform, loads the color array, and runs
// DPSOFTRAST_Array_Transform on the first texcoord array with the TexMatrix uniform.
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3276 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader: per pixel computes
//   texel * (Color_Diffuse * vertexcolor + Color_Ambient)
// in 16-bit fixed point, where vertexcolor is the interpolated DPSOFTRAST_ARRAY_COLOR attribute
// multiplied by buffer_z[x]. The fourth lane of the ambient constant is replaced by the Alpha
// uniform and the fourth lane of the diffuse constant is zeroed.
// Pixels are written straight into the framebuffer row unless alpha test or a non-opaque blend
// mode is active; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some short lines are not visible in this excerpt (braces, declarations of
// data/slope/mod2/pix2, and the statements between the 4-pixel and 1-pixel paths, which
// presumably advance x past the 4-pixel group and skip masked pixels) - confirm.
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3284 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination defaults to the framebuffer, offset so that pixel[x*4] addresses span pixel x
3285 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286 int x, startx = span->startx, endx = span->endx;
3287 __m128i Color_Ambientm, Color_Diffusem;
3289 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// direct framebuffer writes are only used for opaque, non-alpha-tested output
3295 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296 pixel = buffer_FragColorbgra8;
// ambient color in fixed point (x256), lanes 0 and 2 swapped; fourth lane becomes Alpha*255
3297 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3298 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color in fixed point (x4096), lanes 0 and 2 swapped; fourth lane cleared
3301 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3304 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// move the attribute to the first pixel and pre-scale value and step to fixed point (x4096);
// 4096 * 4096 >> 16 gives the same x256 scale as the ambient term it is added to
3307 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances by one slope per loop increment; the 4-pixel path adds three more steps itself
3310 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3312 __m128i color, mod, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3313 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3316 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod holds pixels x and x+1, mod2 holds x+2 and x+3; each uses its own z lane
3318 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319 data = _mm_add_ps(data, slope);
3320 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321 data = _mm_add_ps(data, slope);
3322 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323 data = _mm_add_ps(data, slope);
3324 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, texel held in the high byte of each lane
3325 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3335 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3337 mod = _mm_packs_epi32(mod, mod);
3338 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to go through the common finish step
3341 if (pixel == buffer_FragColorbgra8)
3342 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: runs DPSOFTRAST_Array_TransformProject on the position
// array with the ModelViewProjection matrix uniform, DPSOFTRAST_Array_Transform on the first
// texcoord array with the TexMatrix uniform, and loads the TEXCOORD4 array (used by the pixel
// stage to sample the lightmap).
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3350 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader: per pixel computes
//   texel * (Color_Diffuse * lightmap + Color_Ambient)            (no GLOW)
//   texel * (Color_Diffuse * lightmap + Color_Ambient) + Color_Glow * glowtexel   (GLOW)
// in 16-bit fixed point (constants scaled by 256, texels held in the high byte of each lane).
// The fourth lane of the ambient constant is replaced by the Alpha uniform; the fourth lanes of
// the diffuse and glow constants are zeroed.
// Pixels are written straight into the framebuffer row unless alpha test or a non-opaque blend
// mode is active; in that case they go to a temporary buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some short lines are not visible in this excerpt (braces, the 'else' that
// presumably separates the glow loop from the plain loop, the declaration of pix2, and the
// statements between the 4-pixel and 1-pixel paths, which presumably advance x past the
// 4-pixel group and skip masked pixels) - confirm against the full file.
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3358 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination defaults to the framebuffer, offset so that pixel[x*4] addresses span pixel x
3359 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360 int x, startx = span->startx, endx = span->endx;
3361 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap uses TEXCOORD4
3368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// direct framebuffer writes are only used for opaque, non-alpha-tested output
3370 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371 pixel = buffer_FragColorbgra8;
// ambient color in fixed point (x256), lanes 0 and 2 swapped; fourth lane becomes Alpha*255
3372 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color in fixed point (x256), lanes 0 and 2 swapped; fourth lane cleared
3376 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant
3379 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3381 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color in fixed point (x256), lanes 0 and 2 swapped; fourth lane cleared
3382 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow; lets the 1-pixel path do both products in one multiply
3385 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386 for (x = startx;x < endx;x++)
3388 __m128i color, lightmap, glow, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3389 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3392 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pix covers pixels x and x+1, pix2 covers x+2 and x+3
3395 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3396 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3398 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3399 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: only the low 64 bits of color/lightmap/glow carry data
3407 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
// low half: (diffuse*lightmap + ambient) * color; high half: (0 + glowcolor) * glow
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the glow term (high half) onto the lit color (low half)
3411 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3417 for (x = startx;x < endx;x++)
3419 __m128i color, lightmap, pix;
// fast path: four consecutive pixels whose mask bytes are all 1
3420 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3423 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3425 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3426 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3435 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer still has to go through the common finish step
3441 if (pixel == buffer_FragColorbgra8)
3442 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only the position array is processed, via
// DPSOFTRAST_Array_TransformProject with the ModelViewProjection matrix uniform.
3448 void DPSOFTRAST_VertexShader_FakeLight(void)
3450 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: as visible here it performs no lighting; it emits
// all-zero pixels for the span's [startx, endx) range and hands them to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): one line at the top of the body is not visible in this excerpt - confirm it is
// only a comment.
3453 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear only the bytes covered by this span (4 bytes per pixel)
3459 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3460 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// forward declaration: DPSOFTRAST_VertexShader_LightDirection is defined further down the file
3465 void DPSOFTRAST_VertexShader_LightDirection(void);
// Vertex stage of the model-space light-direction-map shader: reuses the LightDirection vertex
// shader and additionally loads the TEXCOORD4 array.
3466 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3468 DPSOFTRAST_VertexShader_LightDirection();
3469 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Forward declaration: DPSOFTRAST_PixelShader_LightDirection is defined further down
// in this file but is called by the wrapper below.
3472 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Pixel stage for the model-space light-direction-map mode.
// Pure delegation: DPSOFTRAST_PixelShader_LightDirection itself checks
// thread->shader_mode for SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE and switches
// to the deluxemap/lightmap code path.
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3475 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map mode.
// Delegates entirely to the plain lightmap vertex shader; no deluxemap-specific
// vertex work is done here.
3480 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3482 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the tangent-space light-direction-map mode.
// Delegates entirely to the plain lightmap pixel shader, so this mode currently
// renders exactly like ordinary lightmapping.
3485 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3487 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for directional lighting.
// For every vertex it projects the light direction and the vertex-to-eye vector onto
// the per-vertex basis held in TEXCOORD1/2/3 (named svector, tvector and normal here)
// and stores the results as new varyings:
//   TEXCOORD5 = light vector in that basis (w = 0)
//   TEXCOORD6 = eye vector in that basis   (w = 0)
// TEXCOORD0 is run through the TexMatrixM1 uniform, and the position is finally
// passed to DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
// NOTE(review): the declarations of i, LightDir, position, svector, tvector, normal and
// EyeVector are not visible in this excerpt - assumed to be locals of this function.
3493 void DPSOFTRAST_VertexShader_LightDirection(void)
3496 int numvertices = dpsoftrast.numvertices;
3498 float LightVector[4];
3499 float EyePosition[4];
3500 float EyeVectorModelSpace[4];
// Copy the two uniforms used below into locals (4 floats per uniform slot).
3506 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3507 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3508 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3509 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3510 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3511 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3512 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3513 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input attributes into post_array4f so the loop below can read them.
// The position is loaded untransformed here; projection happens after the loop.
3514 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3515 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3516 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3517 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3518 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3519 for (i = 0;i < numvertices;i++)
// Fetch xyz of the position and of the three basis vectors for vertex i.
3521 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3522 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3523 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3524 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3525 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3526 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3527 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3528 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3529 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3530 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3531 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3532 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)).
3533 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3534 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3535 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3538 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3539 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Eye vector = (EyePosition - position) expressed in the same basis; it is not normalised here.
3540 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3541 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3542 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3543 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3544 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3545 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3547 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3548 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3549 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the second argument is -1 rather than an array index; presumably this
// tells TransformProject to reuse the position already loaded above - confirm against
// DPSOFTRAST_Array_TransformProject.
3551 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros used by the per-pixel lighting code below.
// Min/Max evaluate each argument twice, so do not pass expressions with side effects.
3554 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3555 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component dot product of two indexable vectors (only elements 0..2 are read).
3556 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3557 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Uses sqrt() (double precision), so the result is a double even for float vectors.
3558 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement-style macro operating on v; it begins by computing len = sqrt(dot(v,v)).
// NOTE(review): the remaining continuation lines of this macro are not visible here.
3559 #define DPSOFTRAST_Vector3Normalize(v)\
3562 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage for directional lighting (also used for the model-space
// light-direction-map mode via thread->shader_mode).
//
// Parameters:
//   thread   - per-thread state; supplies uniform4f, shader_permutation and shader_mode.
//   triangle - triangle being rasterised; used for attribute interpolation and texture fetches.
//   span     - horizontal span to shade; pixels x in [span->startx, span->endx) are written.
//
// Three per-pixel loops are visible, selected by shader_permutation:
//   1. SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular (half-vector) lighting.
//   2. SHADERPERMUTATION_DIFFUSE:  ambient + diffuse lighting.
//   3. a final loop with ambient only.
// In every loop SHADERPERMUTATION_GLOW adds glow texture * Color_Glow.
// The result is written as BGRA bytes into buffer_FragColorbgra8 and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// NOTE(review): several lines (braces, else, the declarations of z, d, glosstex,
// eyenormal, diffuse, specular and the per-pixel assignment of z) are not visible in
// this excerpt; comments below only describe what is visible.
3573 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3575 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3576 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585 int x, startx = span->startx, endx = span->endx;
3586 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3587 float LightVectordata[4];
3588 float LightVectorslope[4];
3589 float EyeVectordata[4];
3590 float EyeVectorslope[4];
3591 float VectorSdata[4];
3592 float VectorSslope[4];
3593 float VectorTdata[4];
3594 float VectorTslope[4];
3595 float VectorRdata[4];
3596 float VectorRslope[4];
3598 float diffusetex[4];
3600 float surfacenormal[4];
3601 float lightnormal[4];
3602 float lightnormal_modelspace[4];
3604 float specularnormal[4];
3607 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped so that index 0/1/2 lines
// up with the byte order of the *bgra8 buffers they are multiplied with.
// Element 3 is forced to 0 for every colour except Color_Ambient, whose element 3
// carries the Alpha uniform (it alone scales the output alpha channel).
3609 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3610 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3611 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3612 Color_Glow[3] = 0.0f;
3613 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3614 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3615 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3616 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3617 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3618 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3619 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3620 Color_Pants[3] = 0.0f;
3621 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3622 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3623 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3624 Color_Shirt[3] = 0.0f;
// Begin the span (fills buffer_z) and fetch the textures every path needs.
// Pants/shirt and glow textures are fetched only when their permutation bit is set,
// so those buffers are uninitialised otherwise and must not be read.
3625 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3626 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3629 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3632 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3634 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3636 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3638 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3639 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3640 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3641 Color_Diffuse[3] = 0.0f;
3642 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3643 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3644 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3645 LightColor[3] = 0.0f;
3646 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3647 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3648 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3649 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3650 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3651 Color_Specular[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by the raw 0..255 gloss alpha byte.
3652 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3653 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3654 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Model-space deluxemap mode additionally needs the interpolated S/T/R basis
// (TEXCOORD1..3) and the lightmap/deluxemap texels addressed by TEXCOORD4.
3656 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3658 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3659 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3660 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3661 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3662 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3665 for (x = startx;x < endx;x++)
// Texel values are kept in the 0..255 byte range (as floats) throughout; the colour
// uniforms are what scale the final sum.
3668 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3669 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3670 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3671 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// NOTE(review): pants/shirt colormapping is applied in this specular loop only; the
// diffuse and ambient loops below do not show an equivalent step - confirm intended.
3672 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3674 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3675 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3676 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3677 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3679 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3680 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3681 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3682 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 maps 0..255 to roughly [-1, 1); the index swap
// (normal x from byte 2, z from byte 0) undoes the BGRA byte order.
3683 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3684 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3685 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3686 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Model-space deluxemap: the light direction comes from the deluxemap texel and is
// projected onto the interpolated S/T/R vectors. No z factor is applied to S/T/R here;
// the result is normalised immediately afterwards.
3688 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3690 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3691 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3692 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3693 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3695 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3696 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3697 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3698 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3700 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3701 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3702 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3703 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3705 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3706 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3707 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3708 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3710 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3711 DPSOFTRAST_Vector3Normalize(lightnormal);
3713 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Per-pixel light colour from the lightmap texel, overriding the LightColor uniform.
// Dividing by max(0.25, lightnormal.z) boosts the colour at most 4x; the 256 rescales
// the byte value.
3715 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3716 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3717 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3718 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Otherwise: light direction is the interpolated TEXCOORD5 varying evaluated at x and
// scaled by z (z is assigned outside the visible lines - presumably from buffer_z).
3723 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3724 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3725 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3726 DPSOFTRAST_Vector3Normalize(lightnormal);
3729 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3730 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3731 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3732 DPSOFTRAST_Vector3Normalize(eyenormal);
// Half vector between light and eye directions.
3734 specularnormal[0] = lightnormal[0] + eyenormal[0];
3735 specularnormal[1] = lightnormal[1] + eyenormal[1];
3736 specularnormal[2] = lightnormal[2] + eyenormal[2];
3737 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both terms are clamped at 0 from below; the specular exponent is SpecularPower scaled
// by the gloss texture's alpha byte.
3739 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3740 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3741 specular = pow(specular, SpecularPower * glosstex[3]);
// Final colour per channel; only the upper bound (255) is clamped before the byte store.
3742 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3744 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3745 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3746 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3747 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3751 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3752 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3753 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3754 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3757 buffer_FragColorbgra8[x*4+0] = d[0];
3758 buffer_FragColorbgra8[x*4+1] = d[1];
3759 buffer_FragColorbgra8[x*4+2] = d[2];
3760 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, no specular/gloss/eye vector) ----
3763 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3765 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3766 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3767 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3768 Color_Diffuse[3] = 0.0f;
3769 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3770 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3771 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3772 LightColor[3] = 0.0f;
3773 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3774 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3776 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3778 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3779 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3780 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3781 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3782 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3785 for (x = startx;x < endx;x++)
3788 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3789 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3790 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3791 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Same normal-map decode as in the specular path.
3792 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3795 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Same light-direction selection as in the specular path (deluxemap vs interpolated varying).
3797 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3799 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3800 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3801 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3802 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3804 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3805 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3806 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3807 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3809 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3810 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3811 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3812 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3814 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3815 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3816 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3817 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3819 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3820 DPSOFTRAST_Vector3Normalize(lightnormal);
3822 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3824 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3825 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3826 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3827 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3832 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3833 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3834 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3835 DPSOFTRAST_Vector3Normalize(lightnormal);
3838 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3839 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3841 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3842 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3843 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3844 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3848 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3849 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3850 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3851 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3853 buffer_FragColorbgra8[x*4+0] = d[0];
3854 buffer_FragColorbgra8[x*4+1] = d[1];
3855 buffer_FragColorbgra8[x*4+2] = d[2];
3856 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (colour texture * Color_Ambient, plus optional glow) ----
3861 for (x = startx;x < endx;x++)
3864 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3865 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3866 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3867 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3869 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3871 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3872 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3873 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3874 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3878 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3879 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3880 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3881 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3883 buffer_FragColorbgra8[x*4+0] = d[0];
3884 buffer_FragColorbgra8[x*4+1] = d[1];
3885 buffer_FragColorbgra8[x*4+2] = d[2];
3886 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish routine.
3889 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for point-light (light source) rendering.
// For every vertex it reads the position and the per-vertex basis in TEXCOORD1/2/3
// (svector, tvector, normal), then overwrites two of those slots with new varyings:
//   TEXCOORD1 = (LightPosition - position) projected onto svector/tvector/normal (w = 0)
//   TEXCOORD2 = (EyePosition   - position) projected onto svector/tvector/normal (w = 0)
// After the loop the position goes through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform, and TEXCOORD3 is produced by
// DPSOFTRAST_Array_Transform from the POSITION array with the ModelToLightM1 uniform.
// NOTE(review): the declarations of i, position, svector, tvector, normal and EyeVector
// are not visible in this excerpt - assumed to be locals of this function.
3894 void DPSOFTRAST_VertexShader_LightSource(void)
3897 int numvertices = dpsoftrast.numvertices;
3898 float LightPosition[4];
3899 float LightVector[4];
3900 float LightVectorModelSpace[4];
3901 float EyePosition[4];
3902 float EyeVectorModelSpace[4];
// Copy the light and eye position uniforms into locals (4 floats per uniform slot).
3908 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3909 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3910 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3911 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3912 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3913 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3914 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3915 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Bring the input attributes into post_array4f so the loop below can read them.
3916 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3917 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3918 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3919 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3920 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3921 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3922 for (i = 0;i < numvertices;i++)
// All inputs for vertex i are copied to locals first, because TEXCOORD1 and TEXCOORD2
// are overwritten in place further down.
3924 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3925 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3926 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3927 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3928 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3929 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3930 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3931 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3932 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3933 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3934 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3935 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vertex-to-light vector, projected onto the basis; it is not normalised here.
3936 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3937 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3938 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3939 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3940 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3941 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3942 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3943 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3944 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3945 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vertex-to-eye vector, projected onto the same basis.
3946 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3947 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3948 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3949 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3950 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3951 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3952 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3953 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3954 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3955 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): -1 as the second argument presumably means "reuse the position already
// loaded above" - confirm against DPSOFTRAST_Array_TransformProject.
3957 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (the normal input above) is replaced by the position run through the
// ModelToLightM1 uniform; the matching pixel shader interpolates it as its CubeVector.
// NOTE(review): this runs after the projection call above; presumably Array_Transform
// reads the original input position rather than the projected one - confirm.
3958 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3961 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3964 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3965 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3966 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3967 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3968 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3969 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3970 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3971 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3972 int x, startx = span->startx, endx = span->endx;
3973 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3974 float CubeVectordata[4];
3975 float CubeVectorslope[4];
3976 float LightVectordata[4];
3977 float LightVectorslope[4];
3978 float EyeVectordata[4];
3979 float EyeVectorslope[4];
3981 float diffusetex[4];
3983 float surfacenormal[4];
3984 float lightnormal[4];
3986 float specularnormal[4];
3989 float SpecularPower;
3990 float CubeVector[4];
3993 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3994 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3995 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3996 Color_Glow[3] = 0.0f;
3997 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3998 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3999 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4000 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4001 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4002 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4003 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4004 Color_Diffuse[3] = 0.0f;
4005 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4006 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4007 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4008 Color_Specular[3] = 0.0f;
4009 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4010 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4011 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4012 Color_Pants[3] = 0.0f;
4013 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4014 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4015 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4016 Color_Shirt[3] = 0.0f;
4017 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4018 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4019 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4020 LightColor[3] = 0.0f;
4021 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4022 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4023 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4024 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4025 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4026 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4027 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4028 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4030 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4031 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4033 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4034 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4035 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4037 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4038 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4039 for (x = startx;x < endx;x++)
4042 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4043 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4044 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4045 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4046 if (attenuation < 0.01f)
4048 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4050 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4051 if (attenuation < 0.01f)
4055 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4056 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4057 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4058 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4059 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4061 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4062 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4063 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4064 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4066 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4067 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4068 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4069 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4070 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4071 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4072 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4073 DPSOFTRAST_Vector3Normalize(surfacenormal);
4075 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4076 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4077 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4078 DPSOFTRAST_Vector3Normalize(lightnormal);
4080 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4081 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4082 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4083 DPSOFTRAST_Vector3Normalize(eyenormal);
4085 specularnormal[0] = lightnormal[0] + eyenormal[0];
4086 specularnormal[1] = lightnormal[1] + eyenormal[1];
4087 specularnormal[2] = lightnormal[2] + eyenormal[2];
4088 DPSOFTRAST_Vector3Normalize(specularnormal);
4090 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4091 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4092 specular = pow(specular, SpecularPower * glosstex[3]);
4093 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4095 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096 attenuation *= (1.0f / 255.0f);
4097 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4104 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4105 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4106 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4107 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4109 buffer_FragColorbgra8[x*4+0] = d[0];
4110 buffer_FragColorbgra8[x*4+1] = d[1];
4111 buffer_FragColorbgra8[x*4+2] = d[2];
4112 buffer_FragColorbgra8[x*4+3] = d[3];
4115 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4117 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4118 for (x = startx;x < endx;x++)
4121 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4122 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4123 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4124 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4125 if (attenuation < 0.01f)
4127 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4129 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4130 if (attenuation < 0.01f)
4134 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4135 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4136 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4137 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4138 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4140 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4141 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4142 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4143 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4145 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4146 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4147 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4148 DPSOFTRAST_Vector3Normalize(surfacenormal);
4150 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4151 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4152 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4153 DPSOFTRAST_Vector3Normalize(lightnormal);
4155 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4156 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4158 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4159 attenuation *= (1.0f / 255.0f);
4160 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4161 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4162 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4163 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4167 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4168 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4169 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4170 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4172 buffer_FragColorbgra8[x*4+0] = d[0];
4173 buffer_FragColorbgra8[x*4+1] = d[1];
4174 buffer_FragColorbgra8[x*4+2] = d[2];
4175 buffer_FragColorbgra8[x*4+3] = d[3];
4180 for (x = startx;x < endx;x++)
4183 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4184 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4185 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4186 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4187 if (attenuation < 0.01f)
4189 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4191 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4192 if (attenuation < 0.01f)
4196 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4197 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4198 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4199 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4200 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4202 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4203 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4204 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4205 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4207 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4209 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4210 attenuation *= (1.0f / 255.0f);
4211 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4212 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4213 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4214 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4218 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4219 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4220 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4221 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4223 buffer_FragColorbgra8[x*4+0] = d[0];
4224 buffer_FragColorbgra8[x*4+1] = d[1];
4225 buffer_FragColorbgra8[x*4+2] = d[2];
4226 buffer_FragColorbgra8[x*4+3] = d[3];
4229 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4235 void DPSOFTRAST_VertexShader_Refraction(void)
4237 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4240 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4243 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4244 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4245 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4246 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4247 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4252 void DPSOFTRAST_VertexShader_Water(void)
4254 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4258 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4261 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4262 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4263 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4264 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4265 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4270 void DPSOFTRAST_VertexShader_ShowDepth(void)
4272 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4275 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4278 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4279 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4280 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4281 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4282 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4287 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4289 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4292 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4295 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4296 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4297 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4298 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4299 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4304 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4306 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4309 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4312 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4313 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4314 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4315 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4316 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and per-span entry points plus the
// lists of vertex arrays and texture units it consumes.
// NOTE(review): DPSOFTRAST_Interpret_Draw also reads a `lodarrayindex` member
// of this type and the table rows start with an extra integer, so at least
// one leading field is not shown in this listing - confirm against the full file.
4321 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point; takes no arguments
4324 void (*Vertex)(void);
// per-span pixel stage; invoked by DPSOFTRAST_Draw_ProcessSpans as
// DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span)
4325 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices to interpolate; readers stop at the first entry
// >= DPSOFTRAST_ARRAY_TOTAL (the table uses ~0 as that terminator)
4326 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; readers stop at the first entry
// >= DPSOFTRAST_MAXTEXTUREUNITS (the table uses ~0 as that terminator)
4327 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4329 DPSOFTRAST_ShaderModeInfo;
4331 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4333 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4334 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4335 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4336 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4337 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4338 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4339 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4340 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4341 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4342 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4343 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4344 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4345 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4346 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4347 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4348 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued on `thread` (thread->spans[0..numspans)) and
// then resets thread->numspans to 0.
// With depth testing enabled and a depth buffer present, each span gets a
// per-pixel pass/fail mask from the configured depth function; the mask is
// attached to the span before the shader mode's Span function runs.
// Without depth testing, the mask is set to all-pass and the Span function
// runs only if a color buffer exists and the color mask is non-zero.
//   thread - per-thread rasterizer state owning the span and triangle queues
4351 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4358 // unsigned int *colorpixel;
4359 unsigned int *depthpixel;
4365 DPSOFTRAST_State_Triangle *triangle;
4366 DPSOFTRAST_State_Span *span;
// one byte per pixel of the current span, indexed by x
4367 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4368 for (i = 0; i < thread->numspans; i++)
4370 span = &thread->spans[i];
// span->triangle is an index into the thread's triangle queue
4371 triangle = &thread->triangles[span->triangle];
4372 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds a plane equation: w[0] = slope along x, w[1] = slope along y, w[2] = origin term
4374 wslope = triangle->w[0];
// evaluate w at the span's (x, y) base position
4375 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer per-pixel depth increment along x
4376 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// integer depth at the span base, biased by the polygon offset (constant term plus a term scaled by |wslope|)
4377 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depth buffer pointer for the span base; indexed by x below
4378 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4379 startx = span->startx;
// build the pass mask: each case walks x from startx to endx, stepping d by depthslope,
// and compares the stored depth (left operand) against the incoming depth d
4381 switch(thread->fb_depthfunc)
4384 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4385 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4386 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4387 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4388 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4389 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4390 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4392 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4393 //for (x = startx;x < endx;x++)
4394 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4395 // if there is no color buffer, skip pixel shader
// scan for failing pixels at the left and right ends of the span
// NOTE(review): the loop bodies are not visible in this listing; presumably they advance startx / retreat endx - confirm
4396 while (startx < endx && !pixelmask[startx])
4398 while (endx > startx && !pixelmask[endx-1])
4401 continue; // no pixels to fill
// publish the mask and the (possibly updated) start of the span to the shader
4402 span->pixelmask = pixelmask;
4403 span->startx = startx;
4405 // run pixel shader if appropriate
4406 // do this before running depthmask code, to allow the pixelshader
4407 // to clear pixelmask values for alpha testing
4408 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4409 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the span again with the same depth stepping
// NOTE(review): the loop body is not visible in this listing; presumably it stores d for pixels still set in pixelmask - confirm
4410 if (thread->depthmask)
4411 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4417 // no depth testing means we're just dealing with color...
4418 // if there is no color buffer, skip pixel shader
4419 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes when there is no depth test
4421 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4422 span->pixelmask = pixelmask;
4423 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue has been consumed
4427 thread->numspans = 0;
// Declares the Draw command via DEFCOMMAND (macro defined outside this excerpt;
// presumably it produces the DPSOFTRAST_Command_Draw type taken by
// DPSOFTRAST_Interpret_Draw, with 22 as the command id - confirm).
// Members: datasize; starty/endy (row range tested against the thread's bands);
// refcount (atomic counter decremented by the interpreter); clipped;
// firstvertex/numvertices/numtriangles; arrays (vertex data, starting with the
// screen coordinates); element3i / element3s (int or unsigned short indices).
4430 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4432 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4435 int cullface = thread->cullface;
4436 int minx, maxx, miny, maxy;
4437 int miny1, maxy1, miny2, maxy2;
4438 __m128i fbmin, fbmax;
4439 __m128 viewportcenter, viewportscale;
4440 int firstvertex = command->firstvertex;
4441 int numvertices = command->numvertices;
4442 int numtriangles = command->numtriangles;
4443 const int *element3i = command->element3i;
4444 const unsigned short *element3s = command->element3s;
4445 int clipped = command->clipped;
4452 int starty, endy, bandy;
4456 __m128 triangleedge1, triangleedge2, trianglenormal;
4459 DPSOFTRAST_State_Triangle *triangle;
4460 DPSOFTRAST_Texture *texture;
4461 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4462 miny = thread->fb_scissor[1];
4463 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4464 miny1 = bound(miny, thread->miny1, maxy);
4465 maxy1 = bound(miny, thread->maxy1, maxy);
4466 miny2 = bound(miny, thread->miny2, maxy);
4467 maxy2 = bound(miny, thread->maxy2, maxy);
4468 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4470 if (!ATOMIC_DECREMENT(command->refcount))
4472 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4473 MM_FREE(command->arrays);
4477 minx = thread->fb_scissor[0];
4478 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4479 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4480 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4481 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4482 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4483 screen[3] = _mm_setzero_ps();
4484 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4485 for (i = 0;i < numtriangles;i++)
4487 const float *screencoord4f = command->arrays;
4488 const float *arrays = screencoord4f + numvertices*4;
4490 // generate the 3 edges of this triangle
4491 // generate spans for the triangle - switch based on left split or right split classification of triangle
4494 e[0] = element3s[i*3+0] - firstvertex;
4495 e[1] = element3s[i*3+1] - firstvertex;
4496 e[2] = element3s[i*3+2] - firstvertex;
4500 e[0] = element3i[i*3+0] - firstvertex;
4501 e[1] = element3i[i*3+1] - firstvertex;
4502 e[2] = element3i[i*3+2] - firstvertex;
4511 #define SKIPBACKFACE \
4512 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4513 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4514 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4515 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4516 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4520 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4524 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4529 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4530 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4532 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4533 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4535 #define CLIPPEDVERTEXCOPY(k,p1) \
4536 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4538 #define GENATTRIBCOPY(attrib, p1) \
4539 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4540 #define GENATTRIBLERP(attrib, p1, p2) \
4542 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4543 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4545 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4549 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4550 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4551 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4552 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4553 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4554 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4555 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4561 // calculate distance from nearplane
4562 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4563 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4564 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4565 if (clipdist[0] >= 0.0f)
4567 if (clipdist[1] >= 0.0f)
4569 if (clipdist[2] >= 0.0f)
4572 // triangle is entirely in front of nearplane
4573 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4580 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4588 if (clipdist[2] >= 0.0f)
4590 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4597 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4604 else if (clipdist[1] >= 0.0f)
4606 if (clipdist[2] >= 0.0f)
4608 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4615 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4621 else if (clipdist[2] >= 0.0f)
4623 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4628 else continue; // triangle is entirely behind nearplane
4631 // calculate integer y coords for triangle points
4632 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4633 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4634 screenmin = _mm_min_epi16(screeni, screenir),
4635 screenmax = _mm_max_epi16(screeni, screenir);
4636 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4637 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4638 screenmin = _mm_max_epi16(screenmin, fbmin);
4639 screenmax = _mm_min_epi16(screenmax, fbmax);
4640 // skip offscreen triangles
4641 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4643 starty = _mm_extract_epi16(screenmin, 1);
4644 endy = _mm_extract_epi16(screenmax, 1)+1;
4645 if (starty >= maxy1 && endy <= miny2)
4647 screeny = _mm_srai_epi32(screeni, 16);
4650 triangle = &thread->triangles[thread->numtriangles];
4652 // calculate attribute plans for triangle data...
4653 // okay, this triangle is going to produce spans, we'd better project
4654 // the interpolants now (this is what gives perspective texturing),
4655 // this consists of simply multiplying all arrays by the W coord
4656 // (which is basically 1/Z), which will be undone per-pixel
4657 // (multiplying by Z again) to get the perspective-correct array
4660 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4661 __m128 mipedgescale, mipdensity;
4662 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4663 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4664 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4665 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4666 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4667 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4668 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4669 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4670 attribedge1 = _mm_sub_ss(w0, w1);
4671 attribedge2 = _mm_sub_ss(w2, w1);
4672 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4673 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4674 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4675 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4676 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4677 _mm_store_ss(&triangle->w[0], attribxslope);
4678 _mm_store_ss(&triangle->w[1], attribyslope);
4679 _mm_store_ss(&triangle->w[2], attriborigin);
4680 mipedgescale = _mm_setzero_ps();
4681 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4683 __m128 attrib0, attrib1, attrib2;
4684 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4685 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4687 arrays += numvertices*4;
4688 GENATTRIBS(attrib0, attrib1, attrib2);
4689 attriborigin = _mm_mul_ps(attrib1, w1);
4690 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4691 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4692 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4693 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4694 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4695 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4696 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4697 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4698 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4700 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4701 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4702 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4703 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4707 memset(triangle->mip, 0, sizeof(triangle->mip));
4708 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4710 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4711 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4713 texture = thread->texbound[texunit];
4714 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4716 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4717 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4718 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4719 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4720 // this will be multiplied in the texturing routine by the texture resolution
4721 y = _mm_cvtss_si32(mipdensity);
4724 y = (int)(log((float)y)*0.5f/M_LN2);
4725 if (y > texture->mipmaps - 1)
4726 y = texture->mipmaps - 1;
4727 triangle->mip[texunit] = y;
4733 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4736 __m128 xcoords, xslope;
4737 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4738 int yccmask = _mm_movemask_epi8(ycc);
4739 int edge0p, edge0n, edge1p, edge1n;
4746 case 0xFFFF: /*0000*/ y = endy; continue;
4747 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4748 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4749 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4750 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4751 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4752 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4753 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4754 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4755 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4756 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4757 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4758 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4759 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4760 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4761 case 0x0000: /*1111*/ y++; continue;
4769 case 0xFFFF: /*000*/ y = endy; continue;
4770 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4771 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4772 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4773 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4774 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4775 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4776 case 0x0000: /*111*/ y++; continue;
4779 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4780 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4781 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4782 nexty = _mm_extract_epi16(ycc, 0);
4783 if (nexty >= bandy) nexty = bandy-1;
4784 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4785 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4786 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4787 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4788 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4789 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4791 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4792 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4794 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4796 int startx, endx, offset;
4797 startx = _mm_cvtss_si32(xcoords);
4798 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4801 if (startx < 0) startx = 0;
4802 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4804 if (endx > maxx) endx = maxx;
4805 if (startx >= endx) continue;
4806 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4808 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4809 span->triangle = thread->numtriangles;
4812 span->startx = max(minx - offset, 0);
4813 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4814 if (span->startx >= span->endx)
4816 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4817 DPSOFTRAST_Draw_ProcessSpans(thread);
4822 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4824 DPSOFTRAST_Draw_ProcessSpans(thread);
4825 thread->numtriangles = 0;
4829 if (!ATOMIC_DECREMENT(command->refcount))
4831 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4832 MM_FREE(command->arrays);
4835 if (thread->numspans > 0 || thread->numtriangles > 0)
4837 DPSOFTRAST_Draw_ProcessSpans(thread);
4838 thread->numtriangles = 0;
// Reserves a DPSOFTRAST_OPCODE_Draw command together with storage for the
// per-vertex output arrays and the triangle indices of one draw call, and
// points the global dpsoftrast output pointers (screencoord4f, post_array4f[])
// into that storage.
//   firstvertex, numvertices, numtriangles - stored in the command; the first
//       two are also stored in the global dpsoftrast state
//   element3i / element3s - index sources; numtriangles*3 entries are copied
// NOTE(review): the declarations of i and j, the branch keywords and the
// return statement are not among the lines shown here; the caller
// (DPSOFTRAST_DrawTriangles) uses the result as the command pointer.
4843 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
// size of the command structure alone, passed through DPSOFTRAST_ALIGNCOMMAND
4847 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] per vertex up front: these back screencoord4f and
// post_array4f[DPSOFTRAST_ARRAY_POSITION] further down
// NOTE(review): size arithmetic is done in int; very large vertex counts could overflow
4848 int datasize = 2*numvertices*sizeof(float[4]);
4849 DPSOFTRAST_Command_Draw *command;
4850 unsigned char *data;
// walk the array slots listed for the current shader mode and add one
// float[4] per vertex for each; the statement guarded by the
// ">= DPSOFTRAST_ARRAY_TOTAL" test is not visible in this listing
4851 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4853 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4854 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4856 datasize += numvertices*sizeof(float[4]);
// index storage: three unsigned shorts or three ints per triangle
// (the conditions selecting between the two are not visible here)
4859 datasize += numtriangles*sizeof(unsigned short[3]);
4861 datasize += numtriangles*sizeof(int[3]);
4862 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// payload exceeds the per-command limit: allocate only the command structure
// in the command stream and take zero-filled payload storage from MM_CALLOC
// NOTE(review): the MM_CALLOC result is used without a NULL check
4863 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4865 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4866 data = (unsigned char *)MM_CALLOC(datasize, 1);
// other allocation path (presumably the else branch): command and payload are
// one allocation, with the payload starting commandsize bytes in
4870 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4871 data = (unsigned char *)command + commandsize;
4873 command->firstvertex = firstvertex;
4874 command->numvertices = numvertices;
4875 command->numtriangles = numtriangles;
// command->arrays marks the start of the payload
4876 command->arrays = (float *)data;
// clear every post_array4f pointer, then publish the draw range and the
// output pointers in the global state
4877 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4878 dpsoftrast.firstvertex = firstvertex;
4879 dpsoftrast.numvertices = numvertices;
// first numvertices float[4]: screencoord4f; next numvertices float[4]:
// post_array4f[DPSOFTRAST_ARRAY_POSITION]
4880 dpsoftrast.screencoord4f = (float *)data;
4881 data += numvertices*sizeof(float[4]);
4882 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4883 data += numvertices*sizeof(float[4]);
// give each array slot of the shader mode its own numvertices float[4] region,
// in the same order that was used for sizing above
4884 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4886 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4887 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4889 dpsoftrast.post_array4f[j] = (float *)data;
4890 data += numvertices*sizeof(float[4]);
// both index pointers start out NULL; the copies below place the indices in
// the payload after the vertex arrays (selecting conditions not visible here)
4892 command->element3i = NULL;
4893 command->element3s = NULL;
4896 command->element3s = (unsigned short *)data;
4897 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4901 command->element3i = (int *)data;
4902 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues one draw call: allocates the draw command and its arrays, runs the
// current shader mode's Vertex() callback, restricts the draw's Y range to the
// framebuffer, and then wakes worker threads or flushes.
//   firstvertex, numvertices, numtriangles, element3i, element3s - forwarded
//       unchanged to DPSOFTRAST_Draw_AllocateDrawCommand
4907 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4909 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the allocation above pointed dpsoftrast.screencoord4f / post_array4f[] into
// the command's storage before this callback runs
4910 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// bound() is expected to clamp drawstarty/drawendy into [0, fb_height]
4911 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4912 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty Y range: free the separately allocated arrays (only when the command
// is no larger than the bare command structure) and retract the command
// NOTE(review): an early return presumably follows; it is not visible here
4913 if (command->starty >= command->endy)
4915 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4916 MM_FREE(command->arrays);
4917 DPSOFTRAST_UndoCommand(command->commandsize);
4920 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; the draw interpreter ATOMIC_DECREMENTs this and
// frees separately allocated arrays when it reaches zero
4921 command->refcount = dpsoftrast.numthreads;
4923 if (dpsoftrast.usethreads)
4926 DPSOFTRAST_Draw_SyncCommands();
4927 for (i = 0; i < dpsoftrast.numthreads; i++)
4929 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// signal only threads that are starving and whose first or second Y band
// overlaps the draw's [starty, endy) range
// NOTE(review): thread->starving is read here with no lock call visible,
// unlike DPSOFTRAST_Draw_FlushThreads which checks it under drawmutex — confirm this is intended
4930 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4931 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch: process the queue right away
4936 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands on behalf of one thread, starting at
// thread->commandoffset and continuing until the offset equals endoffset.
//   thread    - per-thread state passed to every DPSOFTRAST_Interpret_* handler
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
4940 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4942 int commandoffset = thread->commandoffset;
4943 while (commandoffset != endoffset)
4945 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4946 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for DPSOFTRAST_OPCODE_<name>: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name>, and wrap to 0 once the offset reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL
4948 #define INTERPCOMMAND(name) \
4949 case DPSOFTRAST_OPCODE_##name : \
4950 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4951 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4952 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4953 commandoffset = 0; \
4955 INTERPCOMMAND(Viewport)
4956 INTERPCOMMAND(ClearColor)
4957 INTERPCOMMAND(ClearDepth)
4958 INTERPCOMMAND(ColorMask)
4959 INTERPCOMMAND(DepthTest)
4960 INTERPCOMMAND(ScissorTest)
4961 INTERPCOMMAND(Scissor)
4962 INTERPCOMMAND(BlendFunc)
4963 INTERPCOMMAND(BlendSubtract)
4964 INTERPCOMMAND(DepthMask)
4965 INTERPCOMMAND(DepthFunc)
4966 INTERPCOMMAND(DepthRange)
4967 INTERPCOMMAND(PolygonOffset)
4968 INTERPCOMMAND(CullFace)
4969 INTERPCOMMAND(AlphaTest)
4970 INTERPCOMMAND(AlphaFunc)
4971 INTERPCOMMAND(SetTexture)
4972 INTERPCOMMAND(SetShader)
4973 INTERPCOMMAND(Uniform4f)
4974 INTERPCOMMAND(UniformMatrix4f)
4975 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so advance by the size stored in the
// command itself; unlike the fixed-size cases, the new offset is written back
// to thread->commandoffset immediately after the draw
4977 case DPSOFTRAST_OPCODE_Draw:
4978 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4979 commandoffset += command->commandsize;
4980 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4982 thread->commandoffset = commandoffset;
// NOTE(review): the statements handling DPSOFTRAST_OPCODE_Reset are not visible in this listing
4985 case DPSOFTRAST_OPCODE_Reset:
// publish the final position once the loop has reached endoffset
4990 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread owned by this worker
// Runs for as long as thread->index stays >= 0. The return statement is not
// visible in this listing.
4993 static int DPSOFTRAST_Draw_Thread(void *data)
4995 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4996 while(thread->index >= 0)
// commands are pending when this thread's offset differs from dpsoftrast.drawcommand
4998 if (thread->commandoffset != dpsoftrast.drawcommand)
5000 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// presumably the idle branch: take drawmutex and re-check both the queue
// position and the run condition before going to sleep
5004 Thread_LockMutex(thread->drawmutex);
5005 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: wake a flusher blocked on waitcond, if one flagged itself as waiting
5007 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is set only for the duration of the wait on drawcond; the code that
// signals drawcond tests this flag first
5008 thread->starving = true;
5009 Thread_CondWait(thread->drawcond, thread->drawmutex);
5010 thread->starving = false;
5012 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand and then
// resets the command pool's usedcommands counter to 0.
// With threads enabled it wakes the workers and waits for them; the last loop
// interprets the pending commands on the calling thread instead.
5018 static void DPSOFTRAST_Draw_FlushThreads(void)
5020 DPSOFTRAST_State_Thread *thread;
5022 DPSOFTRAST_Draw_SyncCommands();
5023 if (dpsoftrast.usethreads)
// pass 1: wake each thread that still has commands to run and is asleep on
// drawcond; the condition is re-tested under drawmutex before signalling
5025 for (i = 0; i < dpsoftrast.numthreads; i++)
5027 thread = &dpsoftrast.threads[i];
5028 if (thread->commandoffset != dpsoftrast.drawcommand)
5030 Thread_LockMutex(thread->drawmutex);
5031 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5032 Thread_CondSignal(thread->drawcond);
5033 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread still behind, block on its waitcond; the worker
// signals waitcond when it sees thread->waiting set after catching up
5036 for (i = 0; i < dpsoftrast.numthreads; i++)
5038 thread = &dpsoftrast.threads[i];
5039 if (thread->commandoffset != dpsoftrast.drawcommand)
5041 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by "if", not a loop, so a wakeup that
// arrives before the worker has caught up would not be re-checked — confirm
// Thread_CondWait cannot wake spuriously
5042 if (thread->commandoffset != dpsoftrast.drawcommand)
5044 thread->waiting = true;
5045 Thread_CondWait(thread->waitcond, thread->drawmutex);
5046 thread->waiting = false;
5048 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded branch: run pending commands for each thread
// state directly on the calling thread
5054 for (i = 0; i < dpsoftrast.numthreads; i++)
5056 thread = &dpsoftrast.threads[i];
5057 if (thread->commandoffset != dpsoftrast.drawcommand)
5058 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5061 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which
// brings all threads up to the current command position.
5064 void DPSOFTRAST_Flush(void)
5066 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): no statements of its body are visible in this listing;
// confirm against the full file what work, if any, it performs.
5069 void DPSOFTRAST_Finish(void)
// Resets the global dpsoftrast state and sets up the per-thread rasterizer
// states (and, when threading is enabled, the worker threads).
//   width, height - stored as fb_width / fb_height and used for the default
//                   viewport and scissor rectangles
//   numthreads    - requested worker count; limited to 1..64 when threads are
//                   used, otherwise a single thread state is created
//   interlace     - limited to 0..1, and forced to 0 without threads
//   colorpixels   - stored as fb_colorpixels[0]
//   depthpixels   - stored as fb_depthpixels
// The return statement is not visible in this listing.
5074 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state; every field not assigned below stays 0/NULL
5084 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is declared outside the visible lines; presumably a
// byte-order probe — confirm
5085 dpsoftrast.bigendian = u.b[3];
5086 dpsoftrast.fb_width = width;
5087 dpsoftrast.fb_height = height;
5088 dpsoftrast.fb_depthpixels = depthpixels;
5089 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index [1] is assigned three times; this looks like a
// copy/paste slip for further indices (harmless here because the memset above
// already zeroed the array) — confirm the array length before changing
5090 dpsoftrast.fb_colorpixels[1] = NULL;
5091 dpsoftrast.fb_colorpixels[1] = NULL;
5092 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
5093 dpsoftrast.viewport[0] = 0;
5094 dpsoftrast.viewport[1] = 0;
5095 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5096 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5097 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts with firstfree/end at 1 and max at 0; the texture
// table pointer itself is still NULL from the memset
5098 dpsoftrast.texture_firstfree = 1;
5099 dpsoftrast.texture_end = 1;
5100 dpsoftrast.texture_max = 0;
5101 dpsoftrast.color[0] = 1;
5102 dpsoftrast.color[1] = 1;
5103 dpsoftrast.color[2] = 1;
5104 dpsoftrast.color[3] = 1;
// threads are used only when requested and when Thread_HasThreads() returns nonzero
5105 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5106 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5107 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the MM_CALLOC result is dereferenced below without a NULL check
5108 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5109 for (i = 0; i < dpsoftrast.numthreads; i++)
5111 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5113 thread->cullface = GL_BACK;
// NOTE(review): colormask[1..3] are set but no assignment to colormask[0] is
// visible, so it keeps the 0 from MM_CALLOC — confirm whether that is intended
5114 thread->colormask[1] = 1;
5115 thread->colormask[2] = 1;
5116 thread->colormask[3] = 1;
5117 thread->blendfunc[0] = GL_ONE;
5118 thread->blendfunc[1] = GL_ZERO;
5119 thread->depthmask = true;
5120 thread->depthtest = true;
5121 thread->depthfunc = GL_LEQUAL;
5122 thread->scissortest = false;
5123 thread->alphatest = false;
5124 thread->alphafunc = GL_GREATER;
5125 thread->alphavalue = 0.5f;
// viewport and scissor both default to the whole framebuffer
5126 thread->viewport[0] = 0;
5127 thread->viewport[1] = 0;
5128 thread->viewport[2] = dpsoftrast.fb_width;
5129 thread->viewport[3] = dpsoftrast.fb_height;
5130 thread->scissor[0] = 0;
5131 thread->scissor[1] = 0;
5132 thread->scissor[2] = dpsoftrast.fb_width;
5133 thread->scissor[3] = dpsoftrast.fb_height;
5134 thread->depthrange[0] = 0;
5135 thread->depthrange[1] = 1;
5136 thread->polygonoffset[0] = 0;
5137 thread->polygonoffset[1] = 0;
// interlaced: the framebuffer height is cut into 2*numthreads slices and
// thread i gets slice i as its first band and slice numthreads+i as its second
5139 if (dpsoftrast.interlace)
5141 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5142 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5143 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5144 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// presumably the non-interlaced branch: numthreads slices, thread i gets
// slice i, and the second band is identical to the first
5148 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5149 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// empty span/triangle batches, command cursor at the start of the pool,
// neither waiting nor starving
5152 thread->numspans = 0;
5153 thread->numtriangles = 0;
5154 thread->commandoffset = 0;
5155 thread->waiting = false;
5156 thread->starving = false;
// presumably marks all derived state as needing validation (-1 = all bits set) — confirm
5158 thread->validate = -1;
5159 DPSOFTRAST_Validate(thread, -1);
// with threading enabled, create the two condition variables, the mutex and
// the worker itself; the worker runs DPSOFTRAST_Draw_Thread on this state
5161 if (dpsoftrast.usethreads)
5163 thread->waitcond = Thread_CreateCond();
5164 thread->drawcond = Thread_CreateCond();
5165 thread->drawmutex = Thread_CreateMutex();
5166 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5172 void DPSOFTRAST_Shutdown(void)
5175 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5177 DPSOFTRAST_State_Thread *thread;
5178 for (i = 0; i < dpsoftrast.numthreads; i++)
5180 thread = &dpsoftrast.threads[i];
5181 Thread_LockMutex(thread->drawmutex);
5183 Thread_CondSignal(thread->drawcond);
5184 Thread_UnlockMutex(thread->drawmutex);
5185 Thread_WaitThread(thread->thread, 0);
5186 Thread_DestroyCond(thread->waitcond);
5187 Thread_DestroyCond(thread->drawcond);
5188 Thread_DestroyMutex(thread->drawmutex);
5191 for (i = 0;i < dpsoftrast.texture_end;i++)
5192 if (dpsoftrast.texture[i].bytes)
5193 MM_FREE(dpsoftrast.texture[i].bytes);
5194 if (dpsoftrast.texture)
5195 free(dpsoftrast.texture);
5196 if (dpsoftrast.threads)
5197 MM_FREE(dpsoftrast.threads);
5198 memset(&dpsoftrast, 0, sizeof(dpsoftrast));