3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// Byte alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC; same value as the alignment hard-coded in ATOMIC().
14 #define ATOMIC_SIZE 32
// Per-compiler definitions of ALIGN/ATOMIC (alignment attributes), MEMORY_BARRIER (an SSE store fence in
// all three branches) and the atomic counter operations.
// Apple: counters use the OSAtomic*32Barrier functions from <libkern/OSAtomic.h>.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters use the __sync_* builtins.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) for alignment, Interlocked* functions for counters.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions for anything not defined above: no alignment attribute, a no-op barrier,
// and plain (non-atomic) integer operations.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
// Aligned allocation helpers built on _mm_malloc/_mm_free, using ATOMIC_SIZE as the alignment.
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Allocates nmemb*size bytes aligned to ATOMIC_SIZE and zero-fills them when the allocation succeeds.
// NOTE(review): nmemb*size is not checked for overflow.
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
81 #define MM_FREE _mm_free
// Alternative definitions that map directly onto the C library allocator
// (presumably the build without SSE2 intrinsics — confirm against the surrounding conditionals).
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the vertex attribute arrays (position, color and eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and sizes attribs[] and post_array4f[].
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// Texture record stored in the dpsoftrast.texture[] array.
104 typedef struct DPSOFTRAST_Texture_s
// filter mode, stored by DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts current bindings of this texture — confirm against the bind/unbind code
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels (allocated in DPSOFTRAST_Texture_New); NULL marks an unused slot
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity that DPSOFTRAST_ALIGNCOMMAND rounds command sizes up to;
// COMMAND_ALIGN applies the same alignment attribute as ALIGN().
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header of every command: the opcode and the size recorded by DPSOFTRAST_AllocateCommand.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not fit in the tail of the pool.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the struct type
// DPSOFTRAST_Command_<name>, which begins with the same opcode/commandsize members as DPSOFTRAST_Command.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer below.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command; it is not referenced in the code shown here.
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer reachable as dpsoftrast.commandpool; DPSOFTRAST_AllocateCommand hands out space from commands[].
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data; spans refer back to it through their triangle index.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] is the per-x slope, [1] the per-y slope and [2] the base value, as combined by DPSOFTRAST_CALCATTRIB
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB evaluates attribute array arrayindex of triangle at the span's (x, y) using SSE:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + x * attribs[arrayindex][0] + y * attribs[arrayindex][1]
// The loads are aligned (_mm_load_ps), which relies on the ALIGN() on the attribs member.
// DPSOFTRAST_CALCATTRIB4F (defined right after it) is the scalar form that fills slope[0..3] and data[0..3]
// one component at a time with the same formula.
// NOTE(review): the scalar form expands span as (span->x) rather than ((span)->x), so it should only be
// given a plain pointer expression.
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
166 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
167 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
168 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
169 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
170 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
171 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
172 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
173 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
174 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One span record: the triangle it came from, its framebuffer position, and the usable x range
// together with the per-pixel mask that range was derived from.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty flags kept in thread->validate; each one selects a DPSOFTRAST_Recalc* step in DPSOFTRAST_Validate.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three flags combined.
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the (sfactor, dfactor) pair and the subtract flag.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state; the DPSOFTRAST_Interpret_* functions write into it.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// written by DPSOFTRAST_Interpret_PolygonOffset: [0] alongnormal, [1] intoview
233 float polygonoffset[2];
236 int shader_permutation;
// pointers into dpsoftrast.texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
238 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243 // DPSOFTRAST_VALIDATE_ flags
246 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
249 ALIGN(float fb_viewportcenter[4]);
250 ALIGN(float fb_viewportscale[4]);
252 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
255 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool that DPSOFTRAST_Draw_FreeCommandPool compares with freecommand and drawcommand
264 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it is blocked on this thread's waitcond
266 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals this thread's drawcond before waiting
267 volatile bool starving;
274 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 DPSOFTRAST_State_Thread);
// Global rasterizer state used by the public DPSOFTRAST_* entry points.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets (one depth buffer, up to four color buffers)
283 unsigned int *fb_depthpixels;
284 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
287 ALIGN(float fb_viewportcenter[4]);
288 ALIGN(float fb_viewportscale[4]);
291 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
294 const float *pointer_vertex3f;
295 const float *pointer_color4f;
296 const unsigned char *pointer_color4ub;
297 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
302 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
306 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307 float *screencoord4f;
313 int shader_permutation;
// lowest slot index at which DPSOFTRAST_Texture_New starts looking for an unused entry
317 int texture_firstfree;
// growable texture array (see DPSOFTRAST_Texture_Grow)
318 DPSOFTRAST_Texture *texture;
// points at a static message describing the most recent error reported by the texture functions
323 const char *errorstring;
// array of numthreads per-thread states
328 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand published by DPSOFTRAST_Draw_SyncCommands
330 ATOMIC(volatile int drawcommand);
332 DPSOFTRAST_State_Command_Pool commandpool;
// The single global instance.
336 DPSOFTRAST_State dpsoftrast;
// Scale factor applied when a float depth is converted to an integer depth value.
338 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
// NOTE(review): not referenced in the code shown here; meaning to be confirmed at its use sites.
339 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer as a<<24 | r<<16 | g<<8 | b, scaling each by 255 and adding 0.5
// before truncation. NOTE(review): the arguments are not parenthesized, so pass simple expressions only;
// presumably the inputs are expected to be within 0..1.
340 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts float depth d to an integer as DPSOFTRAST_DEPTHSCALE * (1 - d).
// NOTE(review): d is not parenthesized in the expansion.
341 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
342 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Computes the viewport center and scale vectors from viewport = {x, y, width, height}.
// Element 1 holds the x term, element 2 the y term (flipped against dpsoftrast.fb_height, hence the
// negative scale), element 3 is 0.5 in both vectors, and element 0 is 0 (center) / 1 (scale).
// NOTE(review): the element order looks like {w, x, y, z} — confirm against the code that consumes these vectors.
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
346 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348 fb_viewportcenter[3] = 0.5f;
349 fb_viewportcenter[0] = 0.0f;
350 fb_viewportscale[1] = 0.5f * viewport[2];
351 fb_viewportscale[2] = -0.5f * viewport[3];
352 fb_viewportscale[3] = 0.5f;
353 fb_viewportscale[0] = 1.0f;
// Recomputes thread->fb_scissor and the thread's viewport center/scale from its scissor and viewport state.
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
358 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y flipped against dpsoftrast.fb_height
362 x1 = thread->scissor[0];
363 x2 = thread->scissor[0] + thread->scissor[2];
364 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
366 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the upper edges to the framebuffer size
368 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
370 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
371 thread->fb_scissor[0] = x1;
372 thread->fb_scissor[1] = y1;
373 thread->fb_scissor[2] = x2 - x1;
374 thread->fb_scissor[3] = y2 - y1;
376 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Sets the effective depth function: the configured depthfunc while depth testing is enabled, GL_ALWAYS otherwise.
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
381 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Selects thread->fb_blendmode from the blend factors. The switch key packs sfactor into the bits above
// bit 15 and dfactor into the low bits, and BLENDFUNC() expands to one matching case label.
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtract blending: only GL_SRC_ALPHA / GL_ONE is recognized, every other pair becomes opaque
386 if (thread->blendsubtract)
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
390 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending: unrecognized factor pairs become opaque; two different pairs map to DPSOFTRAST_BLENDMODE_MUL
398 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
400 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
405 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the flags in f is pending; always evaluates to 0.
// NOTE(review): thread is not parenthesized in the expansion.
415 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state selected by mask. Only flags that are currently pending in
// thread->validate are processed, and each flag is cleared before its recalculation runs.
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
419 mask &= thread->validate;
422 if (mask & DPSOFTRAST_VALIDATE_FB)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425 DPSOFTRAST_RecalcFB(thread);
427 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
429 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430 DPSOFTRAST_RecalcDepthFunc(thread);
432 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
434 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. The entry is returned when index lies in [1, texture_end) and the slot
// has pixel storage; index 0 is never accepted.
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
441 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every texbound pointer (global state and each thread)
// so that it refers to the same slot in the reallocated array.
446 static void DPSOFTRAST_Texture_Grow(void)
// remembered so that bound pointers can be converted to slot offsets after the array moves
448 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449 DPSOFTRAST_State_Thread *thread;
453 // expand texture array as needed
454 if (dpsoftrast.texture_max < 1024)
455 dpsoftrast.texture_max = 1024;
457 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only pointer to the array and is not checked for NULL;
// on failure the old block is lost and the rebasing below would compute pointers from NULL.
458 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
459 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460 if (dpsoftrast.texbound[i])
461 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the bindings held by every thread
462 for (j = 0; j < dpsoftrast.numthreads; j++)
464 thread = &dpsoftrast.threads[j];
465 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466 if (thread->texbound[i])
467 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the flags and dimensions, picks an unused slot in dpsoftrast.texture[]
// (growing the array when needed), fills in the mip level table and allocates zeroed pixel storage.
// Each rejected request stores an explanatory message in dpsoftrast.errorstring.
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides, everything else one
480 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482 DPSOFTRAST_Texture *texture;
483 if (width*height*depth < 1)
485 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
490 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
495 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on flag/dimension combinations
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the check directly above
527 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
534 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x is not a power of two (for x >= 1)
537 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
539 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542 // find first empty slot in texture array
543 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being taken
546 dpsoftrast.texture_firstfree = texnum + 1;
547 if (dpsoftrast.texture_max <= texnum)
548 DPSOFTRAST_Texture_Grow();
549 if (dpsoftrast.texture_end <= texnum)
550 dpsoftrast.texture_end = texnum + 1;
551 texture = &dpsoftrast.texture[texnum];
552 memset(texture, 0, sizeof(*texture));
553 texture->flags = flags;
554 texture->width = width;
555 texture->height = height;
556 texture->depth = depth;
557 texture->sides = sides;
// one table row per mip level: byte offset so far, byte size (4 bytes per texel, all sides), width, height, depth
569 s = w * h * d * sides * 4;
570 texture->mipmap[mipmaps][0] = size;
571 texture->mipmap[mipmaps][1] = s;
572 texture->mipmap[mipmaps][2] = w;
573 texture->mipmap[mipmaps][3] = h;
574 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after the first level when mipmapping was not requested
577 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 texture->mipmaps = mipmaps;
584 texture->size = size;
586 // allocate the pixels now
587 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and returns its slot to the unused range. Does nothing when index
// does not name a live texture.
591 void DPSOFTRAST_Texture_Free(int index)
593 DPSOFTRAST_Texture *texture;
594 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
598 MM_FREE(texture->bytes);
599 texture->bytes = NULL;
// clearing the record also leaves bytes NULL, which is what marks the slot as unused
600 memset(texture, 0, sizeof(*texture));
601 // adjust the free range and used range
602 if (dpsoftrast.texture_firstfree > index)
603 dpsoftrast.texture_firstfree = index;
// drop trailing unused slots from the used range
604 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging texels of the previous level.
// Texels are 4 bytes wide and every channel is averaged independently with rounding.
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
609 int i, x, y, z, w, layer0, layer1, row0, row1;
610 unsigned char *o, *i0, *i1, *i2, *i3;
611 DPSOFTRAST_Texture *texture;
612 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to generate when only the base level exists
613 if (texture->mipmaps <= 1)
615 for (i = 1;i < texture->mipmaps;i++)
617 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
621 if (layer1 >= texture->mipmap[i-1][4])
622 layer1 = texture->mipmap[i-1][4]-1;
623 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
627 if (row1 >= texture->mipmap[i-1][3])
628 row1 = texture->mipmap[i-1][3]-1;
// o is the output row in level i; i0..i3 are the parent rows (layer0/layer1 x row0/row1) in level i-1
629 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
630 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634 w = texture->mipmap[i][2];
637 if (texture->mipmap[i-1][2] > 1)
639 // average 3D texture
// eight parent texels per output texel: +4 rounds before the divide by 8
640 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
642 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
650 // average 3D mipmap with parent width == 1
// four parent texels per output texel: +2 rounds before the divide by 4; i2/i3 are not advanced by this loop
651 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
653 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
662 if (texture->mipmap[i-1][2] > 1)
664 // average 2D texture (common case)
665 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
667 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
675 // 2D texture with parent width == 1
// two parent texels per output texel: +1 rounds before the divide by 2
676 o[0] = (i0[0] + i1[0] + 1) >> 1;
677 o[1] = (i0[1] + i1[1] + 1) >> 1;
678 o[2] = (i0[2] + i1[2] + 1) >> 1;
679 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from pixels (tightly packed rows) into
// level 0 of the texture at (blockx, blocky), then regenerates the mip chain.
// NOTE(review): the mip parameter is not used by the code shown here, and the block is not checked
// against the texture dimensions in these lines — confirm.
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
688 DPSOFTRAST_Texture *texture;
690 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel; mipmap[0][2] is the level 0 width
693 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694 while (blockheight > 0)
696 memcpy(dst, pixels, blockwidth * 4);
// the source advances by one block row, the destination by one full texture row
697 pixels += blockwidth * 4;
698 dst += texture->mipmap[0][2] * 4;
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level: copies mipmap[0][1] bytes (the level 0 byte size) from pixels into the
// texture storage and regenerates the mip chain. Does nothing when index does not name a live texture.
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
709 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width recorded for mip level mip, or 0 when index does not name a live texture.
// NOTE(review): mip is used as an array index without a range check.
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
714 DPSOFTRAST_Texture *texture;
715 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716 return texture->mipmap[mip][2];
// Returns the height recorded for mip level mip, or 0 when index does not name a live texture.
// NOTE(review): mip is used as an array index without a range check.
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
720 DPSOFTRAST_Texture *texture;
721 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->mipmap[mip][3];
// Returns the depth recorded for mip level mip, or 0 when index does not name a live texture.
// NOTE(review): mip is used as an array index without a range check.
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture storage, or a null pointer when
// index does not name a live texture.
// NOTE(review): mip is used as an array index without a range check in the lines shown here.
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
736 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported
// through dpsoftrast.errorstring when the texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
740 DPSOFTRAST_Texture *texture;
741 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
744 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
749 texture->filter = filter;
// Installs the framebuffer size, the depth buffer and up to four color buffers in the global state.
// The assignments only run when at least one of the values differs from what is currently stored.
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
754 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
758 dpsoftrast.fb_width = width;
759 dpsoftrast.fb_height = height;
760 dpsoftrast.fb_depthpixels = depthpixels;
761 dpsoftrast.fb_colorpixels[0] = colorpixels0;
762 dpsoftrast.fb_colorpixels[1] = colorpixels1;
763 dpsoftrast.fb_colorpixels[2] = colorpixels2;
764 dpsoftrast.fb_colorpixels[3] = colorpixels3;
767 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the written commands: copies commandpool.freecommand into
// dpsoftrast.drawcommand, issuing MEMORY_BARRIER first when threads are in use.
769 static void DPSOFTRAST_Draw_SyncCommands(void)
771 if(dpsoftrast.usethreads) MEMORY_BARRIER;
772 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least space bytes of the command pool available by recomputing how much of the pool
// the threads still have to process and, when needed, waiting on a thread that is behind.
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
777 DPSOFTRAST_State_Thread *thread;
779 int freecommand = dpsoftrast.commandpool.freecommand;
780 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
781 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
783 DPSOFTRAST_Draw_SyncCommands();
789 for (i = 0; i < dpsoftrast.numthreads; i++)
791 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the write position, wrapped to the pool size
792 commandoffset = freecommand - thread->commandoffset;
793 if (commandoffset < 0)
794 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance seen so far
795 if (commandoffset > usedcommands)
798 usedcommands = commandoffset;
801 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
803 thread = &dpsoftrast.threads[waitindex];
804 Thread_LockMutex(thread->drawmutex);
// only wait when the thread has not yet reached the published draw position
805 if (thread->commandoffset != dpsoftrast.drawcommand)
807 thread->waiting = true;
808 if (thread->starving) Thread_CondSignal(thread->drawcond);
809 Thread_CondWait(thread->waitcond, thread->drawmutex);
810 thread->waiting = false;
812 Thread_UnlockMutex(thread->drawmutex);
814 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (the masking assumes COMMAND_SIZE is a power of two).
817 #define DPSOFTRAST_ALIGNCOMMAND(size) \
818 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>, using the
// aligned struct size, and yields a pointer of the matching struct type.
819 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
820 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a command in dpsoftrast.commandpool.commands and fills in the command header
// (opcode and commandsize). When the command does not fit before the end of the buffer, a
// DPSOFTRAST_OPCODE_Reset header is written at the current position and the skipped tail is counted as used.
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 DPSOFTRAST_Command *command;
825 int freecommand = dpsoftrast.commandpool.freecommand;
826 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra = one command header, plus the unusable tail when the command does not fit before the end
827 int extra = sizeof(DPSOFTRAST_Command);
828 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough unused space: release some first, then reload the bookkeeping values
830 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832 if (dpsoftrast.usethreads)
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
835 DPSOFTRAST_Draw_FlushThreads();
836 freecommand = dpsoftrast.commandpool.freecommand;
837 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command does not fit in the tail: mark the tail with a Reset opcode and account for it
839 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842 command->opcode = DPSOFTRAST_OPCODE_Reset;
843 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// header of the new command
846 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847 command->opcode = opcode;
848 command->commandsize = size;
850 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// store the updated write position and used byte count
852 dpsoftrast.commandpool.freecommand = freecommand;
853 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back size bytes of the command pool by adjusting commandpool.freecommand (with a correction by
// the pool size) and reducing commandpool.usedcommands by size.
857 static void DPSOFTRAST_UndoCommand(int size)
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
863 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864 usedcommands -= size;
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): x, y, width, height.
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Copies the command's rectangle into thread->viewport and marks the framebuffer-derived state dirty.
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
872 thread->viewport[0] = command->x;
873 thread->viewport[1] = command->y;
874 thread->viewport[2] = command->width;
875 thread->viewport[3] = command->height;
876 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a Viewport command in the command pool, and also stores the rectangle in the global state
// and recomputes the global viewport center/scale from it.
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
880 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
883 command->width = width;
884 command->height = height;
886 dpsoftrast.viewport[0] = x;
887 dpsoftrast.viewport[1] = y;
888 dpsoftrast.viewport[2] = width;
889 dpsoftrast.viewport[3] = height;
890 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): r, g, b, a as floats.
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Clears the scissor rectangle of every attached color buffer to the command's color, limited to the
// row ranges read from thread->miny1/maxy1 and thread->miny2/maxy2.
// NOTE(review): these look like the two row bands assigned to this thread — confirm where they are set.
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 int i, x1, y1, x2, y2, w, h, x, y;
897 int miny1 = thread->miny1;
898 int maxy1 = thread->maxy1;
899 int miny2 = thread->miny2;
900 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
904 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905 x1 = thread->fb_scissor[0];
906 y1 = thread->fb_scissor[1];
907 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the overall range covered by the two row ranges
909 if (y1 < miny1) y1 = miny1;
910 if (y2 > maxy2) y2 = maxy2;
915 // FIXME: honor fb_colormask?
// packed clear value in the layout produced by DPSOFTRAST_BGRA8_FROM_RGBA32F
916 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917 for (i = 0;i < 4;i++)
919 if (!dpsoftrast.fb_colorpixels[i])
// the row limit is first min(y2, maxy1); the step then moves y up to at least miny2 and the limit to min(y2, maxy2)
921 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in color buffer i
924 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925 for (x = x1;x < x2;x++)
// Allocates a ClearColor command in the command pool for the given color.
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
932 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): depth as a float.
939 DEFCOMMAND(3, ClearDepth, float depth;)
// Clears the scissor rectangle of the depth buffer to the command's depth, limited to the row ranges
// read from thread->miny1/maxy1 and thread->miny2/maxy2.
// NOTE(review): these look like the two row bands assigned to this thread — confirm where they are set.
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
942 int x1, y1, x2, y2, w, h, x, y;
943 int miny1 = thread->miny1;
944 int maxy1 = thread->maxy1;
945 int miny2 = thread->miny2;
946 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
950 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the overall range covered by the two row ranges
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
// integer depth value as produced by DPSOFTRAST_DEPTH32_FROM_DEPTH32F
961 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// the row limit is first min(y2, maxy1); the step then moves y up to at least miny2 and the limit to min(y2, maxy2)
962 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in the depth buffer
965 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966 for (x = x1;x < x2;x++)
// Allocates a ClearDepth command in the command pool for depth value d.
970 void DPSOFTRAST_ClearDepth(float d)
972 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one enable value per channel.
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores the four channel enables as 0/1 and builds the matching pixel write mask.
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
979 thread->colormask[0] = command->r != 0;
980 thread->colormask[1] = command->g != 0;
981 thread->colormask[2] = command->b != 0;
982 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag yields all-zero or all-one bits, which is then limited to that channel's byte
// (red in bits 16-23, green in bits 8-15, blue in bits 0-7, alpha in bits 24-31)
983 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Allocates a ColorMask command in the command pool for the given channel enables.
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
987 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag.
994 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the depth test enable and marks the derived depth function dirty.
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
997 thread->depthtest = command->enable;
998 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Allocates a DepthTest command in the command pool carrying the enable flag.
1000 void DPSOFTRAST_DepthTest(int enable)
1002 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
1006 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the scissor test enable and marks the framebuffer-derived state dirty.
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1009 thread->scissortest = command->enable;
1010 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a ScissorTest command in the command pool carrying the enable flag.
1012 void DPSOFTRAST_ScissorTest(int enable)
1014 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015 command->enable = enable;
// Scissor command (opcode 7): x, y, width, height as floats.
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Copies the command's rectangle into thread->scissor and marks the framebuffer-derived state dirty.
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1021 thread->scissor[0] = command->x;
1022 thread->scissor[1] = command->y;
1023 thread->scissor[2] = command->width;
1024 thread->scissor[3] = command->height;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a Scissor command in the command pool for the given rectangle.
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1029 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1032 command->width = width;
1033 command->height = height;
// BlendFunc command (opcode 8): source and destination factors.
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the blend factors and marks the derived blend mode dirty.
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1039 thread->blendfunc[0] = command->sfactor;
1040 thread->blendfunc[1] = command->dfactor;
1041 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendFunc command in the command pool carrying both factors.
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1045 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046 command->sfactor = sfactor;
1047 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag.
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the subtract flag and marks the derived blend mode dirty.
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1053 thread->blendsubtract = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendSubtract command in the command pool carrying the enable flag.
1056 void DPSOFTRAST_BlendSubtract(int enable)
1058 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059 command->enable = enable;
// DepthMask command (opcode 10): enable flag.
1062 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the depth mask flag in the thread state.
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 thread->depthmask = command->enable;
// Allocates a DepthMask command in the command pool carrying the enable flag.
1067 void DPSOFTRAST_DepthMask(int enable)
1069 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070 command->enable = enable;
// DepthFunc command (opcode 11): comparison function.
1073 DEFCOMMAND(11, DepthFunc, int func;)
// Stores the depth comparison function in the thread state.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest this does not set DPSOFTRAST_VALIDATE_DEPTHFUNC,
// although DPSOFTRAST_RecalcDepthFunc derives fb_depthfunc from depthfunc — confirm that fb_depthfunc
// is refreshed some other way after the function changes.
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1076 thread->depthfunc = command->func;
// Allocates a DepthFunc command in the command pool carrying the comparison function.
1078 void DPSOFTRAST_DepthFunc(int func)
1080 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081 command->func = func;
// Declares the DepthRange command (DEFCOMMAND id 12) with fields nearval/farval.
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: depthrange[0] receives the near value and
// depthrange[1] the far value.
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1087 thread->depthrange[0] = command->nearval;
1088 thread->depthrange[1] = command->farval;
// Obtains a DepthRange command record and stores the near/far values in it.
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1092 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093 command->nearval = nearval;
1094 command->farval = farval;
// Declares the PolygonOffset command (DEFCOMMAND id 13) with fields
// alongnormal/intoview.
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: polygonoffset[0] receives alongnormal and
// polygonoffset[1] receives intoview.
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1100 thread->polygonoffset[0] = command->alongnormal;
1101 thread->polygonoffset[1] = command->intoview;
// Obtains a PolygonOffset command record and stores both offsets in it.
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1105 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106 command->alongnormal = alongnormal;
1107 command->intoview = intoview;
// Declares the CullFace command (DEFCOMMAND id 14) with one field, mode.
1110 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: copies the mode into thread->cullface.
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1113 thread->cullface = command->mode;
// Obtains a CullFace command record and stores the culling mode in it.
1115 void DPSOFTRAST_CullFace(int mode)
1117 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118 command->mode = mode;
// Declares the AlphaTest command (DEFCOMMAND id 15) with one field, enable.
1121 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: copies the flag into thread->alphatest.
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1124 thread->alphatest = command->enable;
// Obtains an AlphaTest command record and stores the enable flag in it.
1126 void DPSOFTRAST_AlphaTest(int enable)
1128 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129 command->enable = enable;
// Declares the AlphaFunc command (DEFCOMMAND id 16) with fields func and ref.
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: func goes to thread->alphafunc and the
// reference value goes to thread->alphavalue.
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1135 thread->alphafunc = command->func;
1136 thread->alphavalue = command->ref;
// Obtains an AlphaFunc command record and fills it in.
// NOTE(review): only the func store is visible in this listing; the store of
// ref is presumably on a line not shown here - confirm.
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1140 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141 command->func = func;
// Sets the current constant color by writing r, g, b, a directly into
// dpsoftrast.color[0..3]. Unlike the state setters above, no command record
// is allocated here.
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 dpsoftrast.color[0] = r;
1148 dpsoftrast.color[1] = g;
1149 dpsoftrast.color[2] = b;
1150 dpsoftrast.color[3] = a;
// Copies a rectangle of color buffer 0 (dpsoftrast.fb_colorpixels[0]) into
// outpixels.
//   blockx, blocky, blockwidth, blockheight - requested rectangle
//   outpixels - destination, addressed with a row pitch of blockwidth * 4 bytes
// The rectangle edges are clamped to [0, fb_width] x [0, fb_height].
// Source rows are addressed as (fb_height - 1 - y), so the framebuffer is read
// in the opposite vertical order from the output rows.
// Two row loops exist; dpsoftrast.bigendian selects between them.
// NOTE(review): the bx1/by1 declarations and the per-pixel copy statements are
// not visible in this listing.
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1155 int outstride = blockwidth * 4;
1156 int instride = dpsoftrast.fb_width * 4;
1159 int bx2 = blockx + blockwidth;
1160 int by2 = blocky + blockheight;
1164 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer
1168 if (bx1 < 0) bx1 = 0;
1169 if (by1 < 0) by1 = 0;
1170 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// output rows are relative to the clamped by1, source rows are flipped
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of color buffer 0 into one mip level of a texture.
//   index, mip    - texture handle and mip level; the function returns early
//                   when the lookup fails or mip is outside
//                   [0, texture->mipmaps)
//   tx, ty        - destination corner inside the texture level
//   sx, sy        - source corner inside the framebuffer
//   width, height - rectangle size in pixels
// Both rectangles are clamped to their own surface, and the copied size is
// limited to the smaller of the two (tw/th are cut down to sw/sh).
// Source rows run in reverse order: sy1 is turned into sheight - 1 - sy1 and
// the row index is subtracted from it for each copied row.
// When the texture has more than one mip level, its mipmaps are recalculated.
// NOTE(review): the tx1/ty1/sx1/sy1 declarations, the tw/th/sw/sh computations
// and the statement guarded by `if (tw < 1 || th < 1)` are not visible in this
// listing.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0
1227 spixels = dpsoftrast.fb_colorpixels[0];
1228 swidth = dpsoftrast.fb_width;
1229 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is the byte offset of the level inside
// texture->bytes, [2] and [3] are read as its width and height
1230 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231 twidth = texture->mipmap[mip][2];
1232 theight = texture->mipmap[mip][3];
1233 if (tx1 < 0) tx1 = 0;
1234 if (ty1 < 0) ty1 = 0;
1235 if (tx2 > twidth) tx2 = twidth;
1236 if (ty2 > theight) ty2 = theight;
1237 if (sx1 < 0) sx1 = 0;
1238 if (sy1 < 0) sy1 = 0;
1239 if (sx2 > swidth) sx2 = swidth;
1240 if (sy2 > sheight) sy2 = sheight;
1245 if (tw > sw) tw = sw;
1246 if (th > sh) th = sh;
1247 if (tw < 1 || th < 1)
1249 sy1 = sheight - 1 - sy1;
// one memcpy per row, tw pixels of 4 bytes each
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Declares the SetTexture command (DEFCOMMAND id 17) with fields unitnum and
// texture.
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: if a texture is
// currently bound on that unit, its binds counter is atomically decremented,
// then the unit is pointed at the command's texture (which may be NULL, hence
// the check on the old binding).
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array: the pointer is kept (not copied) in
// dpsoftrast.pointer_vertex3f together with its stride.
// DPSOFTRAST_Array_Load multiplies the stride by a vertex index on a byte
// pointer, so the stride is in bytes.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Selects a float color array: stores the pointer and stride and clears the
// byte color pointer, so only one of the two color sources is set at a time.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array: stores the pointer and stride and
// clears the float color pointer, so only one of the two color sources is set
// at a time.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: pointer, component count
// and stride are stored in the per-unit tables.
// NOTE(review): unitnum indexes three arrays without a range check here (the
// sibling DPSOFTRAST_SetTexture validates its unit number) - confirm callers
// always pass a valid unit.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// Declares the SetShader command (DEFCOMMAND id 18) with fields mode and
// permutation.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a SetShader command: copies mode and permutation into the thread's
// shader_mode / shader_permutation.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Selects the shader: fills a SetShader command record and also mirrors the
// same mode/permutation into the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
// keep the global copy in step with what was put in the command
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// Declares the Uniform4f command (DEFCOMMAND id 19): a uniform index plus four
// floats.
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f table at element index*4 (each uniform occupies four floats).
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a four-float uniform from individual values: fills a Uniform4f command
// record and mirrors the values into dpsoftrast.uniform4f at index*4.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
// global copy of the same uniform
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Sets a four-float uniform from an array: v must point at four readable
// floats. They are copied into a Uniform4f command record and into
// dpsoftrast.uniform4f at index*4.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Declares the UniformMatrix4f command (DEFCOMMAND id 20): a uniform index
// plus sixteen floats declared through ALIGN().
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all sixteen floats into the
// thread's uniform4f table starting at element index*4.
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets `arraysize` consecutive 4x4 matrix uniforms from v (16 floats each).
// One UniformMatrix4f command record is filled per matrix; the uniform index
// advances by 4 per matrix and v by 16 floats.
// The rows are loaded with unaligned loads when v is not a multiple of
// ALIGN_SIZE, with aligned loads otherwise, and are written with aligned
// stores both to the command record and to dpsoftrast.uniform4f.
// NOTE(review): the condition guarding the transpose sequence is not visible
// in this listing (presumably the `transpose` argument - confirm).
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the source address
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the halves so that m0..m3
// end up holding element 0..3 of every original row
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations have to be 16-byte aligned
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Declares the Uniform1i command (DEFCOMMAND id 21): a uniform index plus one
// int.
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the value in the thread's uniform1i
// table at the given index.
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: fills a Uniform1i command record and mirrors the
// value into dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into the command record is not visible in this
// listing (presumably on a line not shown) - confirm.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float items from a strided byte source into a packed destination.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - item count (end = dst + size*4 floats)
//   stride - byte distance between source items
// Unaligned loads are used when either src or stride is not a multiple of
// ALIGN_SIZE, aligned loads otherwise.
// NOTE(review): the loop headers and pointer increments are not visible in this
// listing.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float items into 4-float items whose fourth component is 1.0f.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - item count (end = dst + size*4 floats)
//   stride - byte distance between source items
// Tightly packed input (stride == sizeof(float[3])) has a path that turns
// three 16-byte loads into four output items and advances src by four items;
// end4 marks the part of dst that is a multiple of four items.
// The per-item form rotates the vector right by one lane, overwrites lane 0
// with 1.0f using _mm_move_ss, and rotates back, which leaves (x, y, z, 1).
// NOTE(review): the per-item loads read 16 bytes from a 12-byte item, so the
// last item touches 4 bytes beyond the source data - confirm the source
// buffers tolerate that.
// NOTE(review): loop headers, else branches and pointer increments other than
// the two `src +=` lines are not visible in this listing.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1,v2,v3 hold x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// aligned packed path: same shuffles, aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// per-item path: unaligned loads when src or stride is misaligned
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands 2-float items into 4-float items of the form (x, y, 0, 1).
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - item count (end = dst + size*4 floats)
//   stride - byte distance between source items
// v2 holds the constant upper half (0, 0, 0, 1). Tightly packed input
// (stride == sizeof(float[2])) is handled two items per 16-byte load; end2
// marks the even part of dst. The single-item form loads the two source floats
// into the low half of v2 with _mm_loadl_pi.
// NOTE(review): loop headers, else branches and the dst increments are not
// visible in this listing.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1: first item takes v's low half, second item v's high half
1533 __m128 v = _mm_loadu_ps((const float *)src);
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte items into 4-float items, scaling every byte by 1/255.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source item
//   size   - item count (end = dst + size*4 floats)
//   stride - byte distance between source items
// Tightly packed input (stride == sizeof(unsigned char[4])) is handled four
// items per 16-byte load; end4 marks the part of dst that is a multiple of four
// items. Bytes are widened to 32-bit integers by unpacking against zero twice
// (8 -> 16 -> 32 bits) before the integer-to-float conversion.
// The single-item form reads the four bytes through an int pointer.
// NOTE(review): loop headers, else branches and the dst increments are not
// visible in this listing.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single item: four bytes fetched as one int
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single 4-float value at src into dst with an aligned store.
//   dst  - destination (must satisfy _mm_store_ps alignment)
//   src  - four floats, read once with an unaligned load
//   size - item count (end = dst + 4*size floats)
// NOTE(review): the loop that repeats the store up to `end` is not visible in
// this listing.
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors by a 4x4 matrix.
//   out4f       - destination (end = out4f + numitems*4 floats)
//   in4f        - source; may equal out4f
//   inmatrix16f - sixteen floats, read as four 4-float rows m0..m3
// A matrix that compares byte-equal to identity takes a fast path that only
// copies the data (and only when out4f != in4f).
// Otherwise each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. The matrix is
// always read with unaligned loads; the vertex loads are unaligned or aligned
// depending on the address of in4f.
// NOTE(review): loop headers, the stores of the results and the statement that
// ends the identity fast path are not visible in this listing.
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1663 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1665 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides and viewport-maps one vertex.
// `in` is rotated to (1, x, y, z), multiplied by viewportscale, divided by w,
// offset by viewportcenter and rotated back. `out` therefore holds the mapped
// x, y, z in lanes 0..2 and center[0] + scale[0]/w in lane 3; lane 0 of
// viewportcenter/viewportscale belongs to that 1/w term and lanes 1..3 to
// x, y, z.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same computation as DPSOFTRAST_PROJECTVERTEX: the visible statements of the
// two macros are identical (divide by w, scale and offset by the viewport,
// rotate the lanes back).
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by a matrix given as four rows:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration of p is not visible in this listing
// (presumably a copy of `in` - confirm).
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Derives a screen-space row range from the corners of a bounding box.
//   starty, endy   - outputs: *starty = trunc(maxproj), *endy = trunc(minproj)+1
//   minpos, maxpos - opposite corners of the box; the eight corners are built
//                    by mixing their lanes
//   viewportcenter, viewportscale - viewport mapping
//   m0..m3         - matrix rows applied to every corner
// Each corner k is transformed into bb[k]. Its clip distance is the sum of
// lanes 2 and 3 of the transformed corner; corners with a distance >= 0 clear
// bit k of clipmask and contribute lane 0 divided by lane 3 to the
// minproj/maxproj range. For corners still flagged in clipmask, the BBCLIP
// steps interpolate towards each unflagged neighbour along one box axis
// (k^1, k^2, k^4) and add the interpolated projection to the range.
// The range is finally clamped to [-2, 2] and mapped through the viewport.
// NOTE(review): the return statement, the `#define BBCLIP` line and the
// BBFRONT(0)/BBFRONT(7) invocations are not visible in this listing; the
// function presumably returns clipmask - confirm.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every row so that the second component of the
// transformed corner lands in lane 0, where the *_ss operations work
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar mapping below
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to rows
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// note the pairing: the start row comes from maxproj, the end row from minproj
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems vertices to screen space without applying a matrix.
//   out4f          - receives an unmodified copy of every input vertex
//   screen4f       - receives the DPSOFTRAST_PROJECTVERTEX result per vertex
//   starty, endy   - row range outputs, filled by DPSOFTRAST_Vertex_BoundY
//   in4f, numitems - input vertices (4 floats each)
// While iterating, the per-lane minimum and maximum of the input vertices are
// accumulated in minpos/maxpos; they are handed to DPSOFTRAST_Vertex_BoundY
// together with identity matrix rows, and that call's result is returned.
// Unaligned or aligned vertex loads are chosen by the address of in4f; the
// viewport vectors and both outputs use aligned accesses.
// NOTE(review): loop headers and pointer increments are not visible in this
// listing.
1765 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1767 float *end = out4f + numitems*4;
1768 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1769 __m128 minpos, maxpos;
1770 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1772 minpos = maxpos = _mm_loadu_ps(in4f);
1775 __m128 v = _mm_loadu_ps(in4f);
1776 minpos = _mm_min_ps(minpos, v);
1777 maxpos = _mm_max_ps(maxpos, v);
1778 _mm_store_ps(out4f, v);
1779 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1780 _mm_store_ps(screen4f, v);
1788 minpos = maxpos = _mm_load_ps(in4f);
1791 __m128 v = _mm_load_ps(in4f);
1792 minpos = _mm_min_ps(minpos, v);
1793 maxpos = _mm_max_ps(maxpos, v);
1794 _mm_store_ps(out4f, v);
1795 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1796 _mm_store_ps(screen4f, v);
// identity rows: the bounding box is already in the space BoundY expects
1803 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1804 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1805 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1806 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices by a 4x4 matrix and projects them to screen
// space.
//   out4f          - receives the transformed vertices
//   screen4f       - receives the projected vertices
//   starty, endy   - row range outputs, filled by DPSOFTRAST_Vertex_BoundY
//   in4f, numitems - input vertices (4 floats each)
//   inmatrix16f    - sixteen floats, read as four rows with unaligned loads
// A matrix that compares byte-equal to identity delegates to
// DPSOFTRAST_Vertex_Project. Otherwise the bounds minpos/maxpos are taken from
// the vertices BEFORE the transform, and the matrix rows are passed on to
// DPSOFTRAST_Vertex_BoundY, whose result is returned.
// NOTE(review): loop headers and pointer increments are not visible in this
// listing.
1811 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1813 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1814 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1816 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1817 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1818 end = out4f + numitems*4;
1819 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1820 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1821 m0 = _mm_loadu_ps(inmatrix16f);
1822 m1 = _mm_loadu_ps(inmatrix16f + 4);
1823 m2 = _mm_loadu_ps(inmatrix16f + 8);
1824 m3 = _mm_loadu_ps(inmatrix16f + 12);
1825 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1827 minpos = maxpos = _mm_loadu_ps(in4f);
1830 __m128 v = _mm_loadu_ps(in4f);
1831 minpos = _mm_min_ps(minpos, v);
1832 maxpos = _mm_max_ps(maxpos, v);
1833 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1834 _mm_store_ps(out4f, v);
1835 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1836 _mm_store_ps(screen4f, v);
1844 minpos = maxpos = _mm_load_ps(in4f);
1847 __m128 v = _mm_load_ps(in4f);
1848 minpos = _mm_min_ps(minpos, v);
1849 maxpos = _mm_max_ps(maxpos, v);
1850 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1851 _mm_store_ps(out4f, v);
1852 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853 _mm_store_ps(screen4f, v);
1860 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Expands one client vertex array into the 4-float working array
// dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   outarray - index of the destination working array
//   inarray  - which client array to read
// Positions are loaded as 3 floats per vertex. Colors come from the float
// pointer if set, else from the byte pointer if set, else the constant
// dpsoftrast.color is replicated. Every other inarray value is treated as a
// texture coordinate array (indexed by inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
// and loaded with the 2-, 3- or 4-float loader chosen by its component count.
// NOTE(review): the switch statement, the `case` labels of the component
// switch, the breaks and the return statement are not visible in this listing.
1865 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1868 float *outf = dpsoftrast.post_array4f[outarray];
1869 const unsigned char *inb;
1870 int firstvertex = dpsoftrast.firstvertex;
1871 int numvertices = dpsoftrast.numvertices;
1875 case DPSOFTRAST_ARRAY_POSITION:
1876 stride = dpsoftrast.stride_vertex;
1877 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1878 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1880 case DPSOFTRAST_ARRAY_COLOR:
1881 stride = dpsoftrast.stride_color;
1882 if (dpsoftrast.pointer_color4f)
1884 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1885 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1887 else if (dpsoftrast.pointer_color4ub)
1889 stride = dpsoftrast.stride_color;
1890 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1891 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array at all: replicate the constant color
1895 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1899 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1900 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1902 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1903 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1906 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1909 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; a negative inarray reuses whatever that working
// array already holds.
// NOTE(review): the return statement is not visible in this listing.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without a matrix.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; a negative inarray reuses the existing contents.
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1935 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen
// space.
// When inarray >= 0 the client array is first loaded into
// post_array4f[outarray]; a negative inarray reuses the existing contents.
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing.
1944 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1947 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1948 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Steps a perspective term z = 1/w across one span.
//   triangle - supplies the w plane: w[0] is the slope per pixel in x, w[1] the
//              slope per row, w[2] the constant term
//   span     - supplies the pixel range [startx, endx) and the origin (x, y)
//   zf       - per-pixel output array (see the note below)
// The exact reciprocal is evaluated only at sub-span boundaries, at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; within a sub-span z advances
// linearly by dz per pixel.
// NOTE(review): the declarations of x, z and dz and the body of the inner loop
// are not visible in this listing; the loop presumably stores z into zf[x] -
// confirm.
1955 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1958 int startx = span->startx;
1959 int endx = span->endx;
1960 float wslope = triangle->w[0];
1961 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1962 float endz = 1.0f / (w + wslope * startx);
1963 for (x = startx;x < endx;)
1965 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop at the final pixel of the span
1967 if (nextsub >= endx) nextsub = endsub = endx-1;
1968 endz = 1.0f / (w + wslope * nextsub);
// a one-pixel sub-span has no slope to interpolate
1969 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1970 for (; x <= endsub; x++, z += dz)
1975 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1978 int startx = span->startx;
1979 int endx = span->endx;
1982 unsigned char * RESTRICT pixelmask = span->pixelmask;
1983 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1986 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1987 // handle alphatest now (this affects depth writes too)
1988 if (thread->alphatest)
1989 for (x = startx;x < endx;x++)
1990 if (in4f[x*4+3] < 0.5f)
1991 pixelmask[x] = false;
1992 // FIXME: this does not handle bigendian
1993 switch(thread->fb_blendmode)
1995 case DPSOFTRAST_BLENDMODE_OPAQUE:
1996 for (x = startx;x < endx;x++)
2000 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2001 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2002 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2003 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2004 pixel[x*4+0] = d[0];
2005 pixel[x*4+1] = d[1];
2006 pixel[x*4+2] = d[2];
2007 pixel[x*4+3] = d[3];
2010 case DPSOFTRAST_BLENDMODE_ALPHA:
2011 for (x = startx;x < endx;x++)
2015 a = in4f[x*4+3] * 255.0f;
2016 b = 1.0f - in4f[x*4+3];
2017 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2018 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2019 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2020 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2021 pixel[x*4+0] = d[0];
2022 pixel[x*4+1] = d[1];
2023 pixel[x*4+2] = d[2];
2024 pixel[x*4+3] = d[3];
2027 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2028 for (x = startx;x < endx;x++)
2032 a = in4f[x*4+3] * 255.0f;
2033 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2034 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2035 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2036 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2037 pixel[x*4+0] = d[0];
2038 pixel[x*4+1] = d[1];
2039 pixel[x*4+2] = d[2];
2040 pixel[x*4+3] = d[3];
2043 case DPSOFTRAST_BLENDMODE_ADD:
2044 for (x = startx;x < endx;x++)
2048 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2049 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2050 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2051 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2052 pixel[x*4+0] = d[0];
2053 pixel[x*4+1] = d[1];
2054 pixel[x*4+2] = d[2];
2055 pixel[x*4+3] = d[3];
2058 case DPSOFTRAST_BLENDMODE_INVMOD:
2059 for (x = startx;x < endx;x++)
2063 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2064 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2065 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2066 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2067 pixel[x*4+0] = d[0];
2068 pixel[x*4+1] = d[1];
2069 pixel[x*4+2] = d[2];
2070 pixel[x*4+3] = d[3];
2073 case DPSOFTRAST_BLENDMODE_MUL:
2074 for (x = startx;x < endx;x++)
2078 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2079 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2080 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2081 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2082 pixel[x*4+0] = d[0];
2083 pixel[x*4+1] = d[1];
2084 pixel[x*4+2] = d[2];
2085 pixel[x*4+3] = d[3];
2088 case DPSOFTRAST_BLENDMODE_MUL2:
2089 for (x = startx;x < endx;x++)
2093 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2094 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2095 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2096 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2097 pixel[x*4+0] = d[0];
2098 pixel[x*4+1] = d[1];
2099 pixel[x*4+2] = d[2];
2100 pixel[x*4+3] = d[3];
2103 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2104 for (x = startx;x < endx;x++)
2108 a = in4f[x*4+3] * -255.0f;
2109 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2110 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2111 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2112 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2113 pixel[x*4+0] = d[0];
2114 pixel[x*4+1] = d[1];
2115 pixel[x*4+2] = d[2];
2116 pixel[x*4+3] = d[3];
2119 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2120 for (x = startx;x < endx;x++)
2125 b = 1.0f - in4f[x*4+3];
2126 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2127 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2128 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2129 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2130 pixel[x*4+0] = d[0];
2131 pixel[x*4+1] = d[1];
2132 pixel[x*4+2] = d[2];
2133 pixel[x*4+3] = d[3];
2136 case DPSOFTRAST_BLENDMODE_INVADD:
2137 for (x = startx;x < endx;x++)
2141 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2142 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2143 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2144 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2145 pixel[x*4+0] = d[0];
2146 pixel[x*4+1] = d[1];
2147 pixel[x*4+2] = d[2];
2148 pixel[x*4+3] = d[3];
// DPSOFTRAST_Draw_Span_FinishBGRA8
// Merges one span of packed 8-bit pixels (in4ub, also read as 32-bit words
// through ini) into dpsoftrast.fb_colorpixels[0], at row span->y and starting
// column span->x, according to thread->fb_blendmode. span->pixelmask is
// consulted to decide which pixels may be written (see the mask tests below).
// NOTE(review): every line of this listing carries a fused leading line
// number and some short lines (braces, brief statements) are not visible, so
// the comments here describe only what the visible lines establish.
2154 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2158 int startx = span->startx;
2159 int endx = span->endx;
2160 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2161 unsigned char * RESTRICT pixelmask = span->pixelmask;
2162 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2163 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views of the color buffer (byte-wise and 32-bit) to this span's origin
2166 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2167 pixeli += span->y * dpsoftrast.fb_width + span->x;
2168 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned char values, so the comparison against
// 0.5f below is only true for alpha == 0; if a half-alpha threshold is
// intended it would need an 8-bit value such as 128 -- confirm intent.
2169 if (thread->alphatest)
2170 for (x = startx;x < endx;x++)
2171 if (in4ub[x*4+3] < 0.5f)
2172 pixelmask[x] = false;
2173 // FIXME: this does not handle bigendian
2174 switch(thread->fb_blendmode)
2176 case DPSOFTRAST_BLENDMODE_OPAQUE:
// fast path: when four consecutive mask bytes are all 1 (0x01010101), four
// pixels are copied with one unaligned 128-bit load/store
2177 for (x = startx;x + 4 <= endx;)
2179 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2181 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// The blending cases below share the FINISHBLEND(blend2, blend1) macro. From
// the visible lines: it walks the span two pixels at a time and switches on
// the pair of mask bytes read as one 16-bit value; src and dst are widened to
// 16 bits per channel (unpacklo_epi8 against zero) and the result is packed
// back to bytes with unsigned saturation (packus_epi16); a trailing loop then
// handles the remaining pixel. Presumably blend2 is the code applied to a
// pixel pair and blend1 to a single pixel -- the lines that expand the two
// macro parameters are not visible here.
// ALPHA: dst += ((src - dst) * srcalpha) >> 8. Both factors are shifted left
// by 4 so that the high 16 bits of the product (mulhi) equal the product >> 8.
// srcalpha is channel 3 of src broadcast to all channels by the shuffles.
2195 case DPSOFTRAST_BLENDMODE_ALPHA:
2196 #define FINISHBLEND(blend2, blend1) \
2197 for (x = startx;x + 1 < endx;x += 2) \
2200 switch (*(const unsigned short*)&pixelmask[x]) \
2203 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2204 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2206 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2209 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2210 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2212 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2216 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2218 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2223 for(;x < endx; x++) \
2226 if (!pixelmask[x]) \
2228 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2231 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2236 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2239 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8
2242 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2244 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst += src (values above 255 saturate when packed back to bytes)
2251 case DPSOFTRAST_BLENDMODE_ADD:
2252 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2254 case DPSOFTRAST_BLENDMODE_INVMOD:
2256 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2261 case DPSOFTRAST_BLENDMODE_MUL:
2262 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2264 case DPSOFTRAST_BLENDMODE_MUL2:
2265 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8 (negative results saturate to 0 when packed)
2267 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2269 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2276 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2278 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) >> 8 (same shift-by-4 / mulhi trick as ALPHA)
2285 case DPSOFTRAST_BLENDMODE_INVADD:
2287 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// DPSOFTRAST_Draw_Span_Texture2DVarying
// Samples the texture bound to unit texunitindex along one span and writes
// float colors to out4f, four floats per pixel at index x*4. Texel bytes
// 2,1,0,3 are stored to output channels 0,1,2,3 respectively.
// Texture coordinates come from vertex attribute arrayindex (data/slope from
// DPSOFTRAST_CALCATTRIB4F); they are multiplied by zf[] only at the ends of
// each sub-span and stepped linearly in 16.16 fixed point in between.
// NOTE(review): every line of this listing carries a fused leading line
// number and some short lines (braces, brief declarations/assignments) are
// not visible, so the comments here describe only what the visible lines show.
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2299 int startx = span->startx;
2300 int endx = span->endx;
2305 float tc[2], endtc[2];
2307 unsigned int tci[2];
2308 unsigned int tci1[2];
2309 unsigned int tcimin[2];
2310 unsigned int tcimax[2];
2315 const unsigned char * RESTRICT pixelbase;
2316 const unsigned char * RESTRICT pixel[4];
2317 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318 // if no texture is bound, just fill it with white
2321 for (x = startx;x < endx;x++)
2323 out4f[x*4+0] = 1.0f;
2324 out4f[x*4+1] = 1.0f;
2325 out4f[x*4+2] = 1.0f;
2326 out4f[x*4+3] = 1.0f;
// use the mip level recorded for this triangle; mipmap[mip][0] is used as a
// byte offset into texture->bytes
2330 mip = triangle->mip[texunitindex];
2331 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332 // if this mipmap of the texture is 1 pixel, just fill it with that color
// NOTE(review): the color below is read from texture->bytes[0..3], not
// through pixelbase computed above; whenever mipmap[mip][0] is non-zero these
// address different texels -- confirm which is intended.
2333 if (texture->mipmap[mip][1] == 4)
2335 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339 for (x = startx;x < endx;x++)
2341 out4f[x*4+0] = c[0];
2342 out4f[x*4+1] = c[1];
2343 out4f[x*4+2] = c[2];
2344 out4f[x*4+3] = c[3];
2348 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2349 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350 flags = texture->flags;
// mipmap[mip][2] and mipmap[mip][3] are used as the level's extent along the
// first and second texture axis: scale factors, row pitch in texels
// (tciwidth), clamp maxima and wrap masks are all derived from them.
// NOTE(review): tcimin is read by the clamp paths below but its assignment is
// not among the visible lines.
2351 tcscale[0] = texture->mipmap[mip][2];
2352 tcscale[1] = texture->mipmap[mip][3];
2353 tciwidth = texture->mipmap[mip][2];
2356 tcimax[0] = texture->mipmap[mip][2]-1;
2357 tcimax[1] = texture->mipmap[mip][3]-1;
2358 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the first pixel, in texels, shifted back by half a texel
2360 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2361 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels;
// the zf-corrected coordinate is evaluated only at sub-span ends
2362 for (x = startx;x < endx;)
2364 unsigned int subtc[2];
2365 unsigned int substep[2];
2366 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2367 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2368 if (nextsub >= endx)
2370 nextsub = endsub = endx-1;
2371 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
// NOTE(review): tc is used below but its assignment is not visible here;
// presumably it takes the previous endtc before endtc is recomputed.
2375 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2376 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// per-pixel step and starting coordinate, both in 16.16 fixed point
2377 substep[0] = (endtc[0] - tc[0]) * subscale;
2378 substep[1] = (endtc[1] - tc[1]) * subscale;
2379 subtc[0] = tc[0] * (1<<16);
2380 subtc[1] = tc[1] * (1<<16);
// four-texel weighted sampling with indices clamped to [tcimin, tcimax]
// (presumably the linear-filter branch; the enclosing test is not visible)
2383 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2385 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): frac keeps the LOW 12 bits of the 16-bit fraction, while the
// texel index is subtc>>16; the top 12 fraction bits would be (subtc>>4)&0xFFF
// -- confirm intent.
// The four weights sum to 0x1000*0x1000 = 0x1000000, so scaling the weighted
// byte sum by 1/0xFF000000 (= 255 * 0x1000000) normalizes it to 0..1.
2387 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2388 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2389 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2390 tci[0] = subtc[0]>>16;
2391 tci[1] = subtc[1]>>16;
2392 tci1[0] = tci[0] + 1;
2393 tci1[1] = tci[1] + 1;
2394 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2395 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2396 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2397 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2398 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2399 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2400 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2401 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2402 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2403 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2404 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2405 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2406 out4f[x*4+0] = c[0];
2407 out4f[x*4+1] = c[1];
2408 out4f[x*4+2] = c[2];
2409 out4f[x*4+3] = c[3];
// same four-texel weighted sampling, but indices wrap by masking with
// (extent - 1); presumably relies on power-of-two extents -- confirm
2414 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2416 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2417 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419 tci[0] = subtc[0]>>16;
2420 tci[1] = subtc[1]>>16;
2421 tci1[0] = tci[0] + 1;
2422 tci1[1] = tci[1] + 1;
2423 tci[0] &= tciwrapmask[0];
2424 tci[1] &= tciwrapmask[1];
2425 tci1[0] &= tciwrapmask[0];
2426 tci1[1] &= tciwrapmask[1];
2427 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2431 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435 out4f[x*4+0] = c[0];
2436 out4f[x*4+1] = c[1];
2437 out4f[x*4+2] = c[2];
2438 out4f[x*4+3] = c[3];
// single-texel lookup with indices clamped to [tcimin, tcimax]
2442 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2444 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2446 tci[0] = subtc[0]>>16;
2447 tci[1] = subtc[1]>>16;
2448 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2449 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2450 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2451 c[0] = pixel[0][2] * (1.0f / 255.0f);
2452 c[1] = pixel[0][1] * (1.0f / 255.0f);
2453 c[2] = pixel[0][0] * (1.0f / 255.0f);
2454 c[3] = pixel[0][3] * (1.0f / 255.0f);
2455 out4f[x*4+0] = c[0];
2456 out4f[x*4+1] = c[1];
2457 out4f[x*4+2] = c[2];
2458 out4f[x*4+3] = c[3];
// single-texel lookup with wrapped (masked) indices
2463 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2465 tci[0] = subtc[0]>>16;
2466 tci[1] = subtc[1]>>16;
2467 tci[0] &= tciwrapmask[0];
2468 tci[1] &= tciwrapmask[1];
2469 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2470 c[0] = pixel[0][2] * (1.0f / 255.0f);
2471 c[1] = pixel[0][1] * (1.0f / 255.0f);
2472 c[2] = pixel[0][0] * (1.0f / 255.0f);
2473 c[3] = pixel[0][3] * (1.0f / 255.0f);
2474 out4f[x*4+0] = c[0];
2475 out4f[x*4+1] = c[1];
2476 out4f[x*4+2] = c[2];
2477 out4f[x*4+3] = c[3];
// DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8
// SSE2 variant of the 2D texture span sampler: samples the texture bound to
// unit texunitindex along one span and writes packed 4-byte pixels to out4ub
// (also written as 32-bit words through outi). Texel bytes are copied or
// blended in their stored order, without channel swapping.
// Texture coordinates come from vertex attribute arrayindex (data/slope from
// DPSOFTRAST_CALCATTRIB); they are multiplied by zf[] at sub-span ends and
// stepped linearly in 16.16 fixed point in between.
// NOTE(review): every line of this listing carries a fused leading line
// number and some short lines (braces, brief statements, some tests) are not
// visible, so the comments here describe only what the visible lines show.
2483 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2487 int startx = span->startx;
2488 int endx = span->endx;
2490 __m128 data, slope, tcscale;
2491 __m128i tcsize, tcmask, tcoffset, tcmax;
2493 __m128i subtc, substep, endsubtc;
2496 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2497 const unsigned char * RESTRICT pixelbase;
2498 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2499 // if no texture is bound, just fill it with white
2502 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2505 mip = triangle->mip[texunitindex];
2506 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2507 // if this mipmap of the texture is 1 pixel, just fill it with that color
// the single texel is read once as a 32-bit word (the store inside the loop
// is not among the visible lines)
2508 if (texture->mipmap[mip][1] == 4)
2510 unsigned int k = *((const unsigned int *)pixelbase);
2511 for (x = startx;x < endx;x++)
2515 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2517 flags = texture->flags;
// tcsize lanes = { mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3] };
// tcmask = tcsize - 1
2518 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2519 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2520 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and scale
// them to texel units
2521 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2522 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2523 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2524 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset holds two 16-bit multipliers per 32-bit lane: 4 in the low half
// and mipmap[mip][2]*4 in the high half, so _mm_madd_epi16 on a pair of
// 16-bit texel indices yields first*4 + second*mipmap[mip][2]*4 (a byte offset)
2525 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: tcmask narrowed to 16-bit lanes; used below both as the clamp
// maximum and as the wrap mask
2526 tcmax = _mm_packs_epi32(tcmask, tcmask);
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2527 for (x = startx;x < endx;)
2529 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2530 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2531 if (nextsub >= endx)
2533 nextsub = endsub = endx-1;
2534 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// NOTE(review): tc and the initial subtc are used below but their
// assignments are not visible here; presumably they take the previous
// endtc / endsubtc before those are recomputed.
2538 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2539 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2540 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc now carries the coordinates of two consecutive pixels (low and high
// 64 bits); substep is doubled so each iteration advances two pixels
2541 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2542 substep = _mm_slli_epi32(substep, 1);
// range test: integer coordinates at the sub-span start and at endsubtc +
// substep must all be >= 0 and < tcmask; when that holds, the loads below
// need neither clamping nor wrapping (presumably inside the linear-filter
// branch; the enclosing test is not visible)
2545 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2546 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = high 16 bits of tcoffset = bytes per texture row
2548 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2549 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2551 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2552 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2553 tci = _mm_madd_epi16(tci, tcoffset);
2554 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2555 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two adjacent texels of one row; ptr + stride is
// the next row
2556 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2557 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2558 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2559 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm = 16-bit lanes of subtc shifted right by 1; each blend below is
// a + mulhi((b - a) << 1, fracm), i.e. a + ((b - a) * fraction >> 16):
// first between the two rows, then between the two columns
2560 fracm = _mm_srli_epi16(subtc, 1);
2561 pix1 = _mm_add_epi16(pix1,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2563 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2564 pix3 = _mm_add_epi16(pix3,
2565 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2566 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2567 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2568 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2569 pix2 = _mm_add_epi16(pix2,
2570 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2571 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2572 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the same filter (presumably for a trailing odd pixel
// of the sub-span; the guarding test is not visible)
2576 const unsigned char * RESTRICT ptr1;
2577 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2578 tci = _mm_madd_epi16(tci, tcoffset);
2579 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2581 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2582 fracm = _mm_srli_epi16(subtc, 1);
2583 pix1 = _mm_add_epi16(pix1,
2584 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2585 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2586 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2587 pix1 = _mm_add_epi16(pix1,
2588 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2590 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered sampling with clamp-to-edge: the four index pairs of the 2x2
// footprint are formed by adding (0,0), (1,0), (0,1), (1,1) to the base index
// pair and clamping each 16-bit index to [0, tcmax]; the four texels are then
// fetched individually
2594 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2596 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2598 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2599 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2600 tci = _mm_madd_epi16(tci, tcoffset);
2601 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603 _mm_setzero_si128());
2604 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606 _mm_setzero_si128());
// NOTE(review): for the second pixel of the pair the indices are masked with
// tcmax (wrap) instead of clamped with min/max as for the first pixel above,
// although this is the clamp-to-edge branch -- looks inconsistent, confirm.
2607 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2608 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2609 tci = _mm_madd_epi16(tci, tcoffset);
2610 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612 _mm_setzero_si128());
2613 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615 _mm_setzero_si128());
2616 fracm = _mm_srli_epi16(subtc, 1);
2617 pix1 = _mm_add_epi16(pix1,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2619 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2620 pix3 = _mm_add_epi16(pix3,
2621 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2622 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2623 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2624 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2625 pix2 = _mm_add_epi16(pix2,
2626 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2627 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2628 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the clamped filter
2632 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2633 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2634 tci = _mm_madd_epi16(tci, tcoffset);
2635 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637 _mm_setzero_si128());
2638 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640 _mm_setzero_si128());
2641 fracm = _mm_srli_epi16(subtc, 1);
2642 pix1 = _mm_add_epi16(pix1,
2643 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2646 pix1 = _mm_add_epi16(pix1,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2649 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered sampling with wrap addressing: footprint indices are masked with tcmax
2655 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2657 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2658 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659 tci = _mm_madd_epi16(tci, tcoffset);
2660 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662 _mm_setzero_si128());
2663 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665 _mm_setzero_si128());
2666 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2667 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668 tci = _mm_madd_epi16(tci, tcoffset);
2669 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671 _mm_setzero_si128());
2672 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674 _mm_setzero_si128());
2675 fracm = _mm_srli_epi16(subtc, 1);
2676 pix1 = _mm_add_epi16(pix1,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2679 pix3 = _mm_add_epi16(pix3,
2680 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2681 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2682 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2683 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2684 pix2 = _mm_add_epi16(pix2,
2685 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2686 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2687 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the wrapped filter
2691 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2692 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693 tci = _mm_madd_epi16(tci, tcoffset);
2694 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696 _mm_setzero_si128());
2697 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699 _mm_setzero_si128());
2700 fracm = _mm_srli_epi16(subtc, 1);
2701 pix1 = _mm_add_epi16(pix1,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2705 pix1 = _mm_add_epi16(pix1,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2708 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// unfiltered sampling, one texel copied per pixel; clamp-to-edge variant
// first (indices clamped to [0, tcmax])
2715 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2717 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2719 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2720 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2723 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2727 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2728 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2729 tci = _mm_madd_epi16(tci, tcoffset);
2730 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// unfiltered sampling with wrap addressing (indices masked with tcmax)
2736 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2742 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2746 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2747 tci = _mm_and_si128(tci, tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2758 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2761 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// DPSOFTRAST_SampleShadowmap: takes a pointer to float data (vector) and
// returns a float.
// NOTE(review): only the signature line is visible in this listing (with a
// fused leading line number); the body is not shown, so its behavior is not
// documented here.
2764 float DPSOFTRAST_SampleShadowmap(const float *vector)
2770 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2773 int startx = span->startx;
2774 int endx = span->endx;
2779 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2780 for (x = startx;x < endx;x++)
2783 c[0] = (data[0] + slope[0]*x) * z;
2784 c[1] = (data[1] + slope[1]*x) * z;
2785 c[2] = (data[2] + slope[2]*x) * z;
2786 c[3] = (data[3] + slope[3]*x) * z;
2787 out4f[x*4+0] = in4f[x*4+0] * c[0];
2788 out4f[x*4+1] = in4f[x*4+1] * c[1];
2789 out4f[x*4+2] = in4f[x*4+2] * c[2];
2790 out4f[x*4+3] = in4f[x*4+3] * c[3];
2794 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2797 int startx = span->startx;
2798 int endx = span->endx;
2803 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2804 for (x = startx;x < endx;x++)
2807 c[0] = (data[0] + slope[0]*x) * z;
2808 c[1] = (data[1] + slope[1]*x) * z;
2809 c[2] = (data[2] + slope[2]*x) * z;
2810 c[3] = (data[3] + slope[3]*x) * z;
2811 out4f[x*4+0] = c[0];
2812 out4f[x*4+1] = c[1];
2813 out4f[x*4+2] = c[2];
2814 out4f[x*4+3] = c[3];
2818 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2820 int x, startx = span->startx, endx = span->endx;
2821 float c[4], localcolor[4];
2822 localcolor[0] = subcolor[0];
2823 localcolor[1] = subcolor[1];
2824 localcolor[2] = subcolor[2];
2825 localcolor[3] = subcolor[3];
2826 for (x = startx;x < endx;x++)
2828 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2829 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2830 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2831 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2832 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2833 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2834 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2835 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2839 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2841 int x, startx = span->startx, endx = span->endx;
2842 for (x = startx;x < endx;x++)
2844 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2845 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2846 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2847 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float span buffers:
// out4f = ina4f + inb4f for every pixel x in [span->startx, span->endx), all four channels.
// No clamping is applied here; `triangle` is not used in the visible lines.
2851 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2853 int x, startx = span->startx, endx = span->endx;
2854 for (x = startx;x < endx;x++)
2856 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2857 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2858 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2859 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float span buffers per pixel: out4f = ina4f * a + inb4f * b, where
// a = 1 - (fourth component of inb4f at that pixel).
// NOTE(review): the declaration of a/b and the assignment of b are not visible in
// this listing; b is presumably inb4f[x*4+3] (making this a lerp by inb's alpha) -- confirm.
2863 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 int x, startx = span->startx, endx = span->endx;
2867 for (x = startx;x < endx;x++)
// weight of the first buffer is the complement of the second buffer's fourth channel
2869 a = 1.0f - inb4f[x*4+3];
2871 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2872 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2873 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2874 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float span buffer toward a single uniform color, using the color's own
// fourth component as the blend factor:
//   out4f = in4f * (1 - color[3]) + color * color[3]
// for every pixel x in [span->startx, span->endx) and all four channels (the fourth
// channel is blended with the same formula). `triangle` is not used in the visible lines.
2878 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2880 int x, startx = span->startx, endx = span->endx;
2881 float localcolor[4], ilerp, lerp;
2882 localcolor[0] = color[0];
2883 localcolor[1] = color[1];
2884 localcolor[2] = color[2];
2885 localcolor[3] = color[3];
// blend weights are constant across the span, so compute them once
2886 ilerp = 1.0f - localcolor[3];
2887 lerp = localcolor[3];
2888 for (x = startx;x < endx;x++)
2890 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2891 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2892 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2893 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2: multiplies an 8-bit-per-channel span (in4ub) by an interpolated vertex
// attribute and writes the result to out4ub. The attribute is evaluated exactly
// (with the zf factor) only at sub-span end points; in between it is stepped
// linearly in 16-bit fixed point scaled by 256.
// NOTE(review): the declarations of x, data, slope, mod and endmod, the assignments
// of mod/submod at the top of each sub-span, the guard in front of the trailing
// single-pixel code, and all braces / preprocessor guards are not visible in this listing.
2899 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2903 int startx = span->startx;
2904 int endx = span->endx;
2907 __m128i submod, substep, endsubmod;
2908 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA -> BGRA to match the byte buffers -- confirm)
2909 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, float and scaled-by-256 integer forms
2911 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the loop body, one sub-span at a time
2913 for (x = startx; x < endx;)
2915 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end on the final pixel and rescale the step
2917 if (nextsub >= endx)
2919 nextsub = endsub = endx-1;
2920 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span, and the per-pixel fixed-point step toward it
2924 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2927 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): each iteration consumes two pixels but adds one per-pixel step to
// both halves of submod; confirm whether the step was meant to be doubled.
2929 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// input bytes go into the upper byte of each 16-bit lane, so the unsigned high
// multiply yields byte * submod / 256
2931 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2932 pix = _mm_mulhi_epu16(pix, submod);
2933 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard is not visible here)
2937 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2938 pix = _mm_mulhi_epu16(pix, submod);
2939 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2: writes an interpolated vertex attribute to out4ub as 8-bit channels.
// The attribute is evaluated exactly (with the zf factor) at sub-span end points and
// stepped linearly in between, in 16-bit fixed point scaled by 4095; the value is
// shifted right by 4 and packed with unsigned saturation to bytes.
// NOTE(review): the declarations of x, data, slope, mod and endmod, the assignments
// of mod/submod at the top of each sub-span, the guard in front of the trailing
// single-pixel code, and all braces / preprocessor guards are not visible in this listing.
2946 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2950 int startx = span->startx;
2951 int endx = span->endx;
2954 __m128i submod, substep, endsubmod;
2955 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA -> BGRA to match the byte buffer -- confirm)
2956 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2957 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, float and scaled-by-4095 integer forms
2958 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2959 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the loop body, one sub-span at a time
2960 for (x = startx; x < endx;)
2962 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2963 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end on the final pixel and rescale the step
2964 if (nextsub >= endx)
2966 nextsub = endsub = endx-1;
2967 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span, and the per-pixel fixed-point step toward it
2971 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2972 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2973 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2974 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2975 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): each iteration consumes two pixels but adds one per-pixel step to
// both halves of submod; confirm whether the step was meant to be doubled.
2976 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 extra fraction bits, then saturate to 0..255 and store two pixels
2978 __m128i pix = _mm_srai_epi16(submod, 4);
2979 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guard is not visible here)
2983 __m128i pix = _mm_srai_epi16(submod, 4);
2984 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 byte version of the bloom combine: out4ub = saturate(ina4ub + (inb4ub - subcolor*255)),
// processed two pixels per iteration with a single-pixel tail.
// NOTE(review): DPSOFTRAST_Draw_Span_AddBloom clamps (inb - subcolor) at zero before
// adding; here the 16-bit difference is added unclamped, so a negative difference
// lowers the ina value before the final 0..255 saturation. Confirm whether a
// saturating subtract was intended.
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
2991 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2994 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers, lanes 0 and 2 swapped, then packed so both
// 64-bit halves hold the same four 16-bit values (one copy per pixel)
2995 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2996 localcolor = _mm_packs_epi32(localcolor, localcolor);
2997 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input from bytes to 16-bit lanes
2999 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3000 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3001 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
// pack back to bytes with unsigned saturation
3002 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3006 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3007 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3008 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3009 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 byte version of the buffer product: out4ub = (ina4ub * inb4ub) / 256 per channel,
// two pixels per iteration with a single-pixel tail.
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
3014 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3017 int x, startx = span->startx, endx = span->endx;
3018 for (x = startx;x+2 <= endx;x+=2)
// ina bytes land in the low byte of each 16-bit lane, inb bytes in the high byte,
// so the unsigned high multiply produces a*b/256
3020 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3021 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3022 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3027 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3028 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3029 pix1 = _mm_mulhi_epu16(pix1, pix2);
3030 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 byte version of the buffer sum: out4ub = min(ina4ub + inb4ub, 255) per channel,
// two pixels per iteration with a single-pixel tail.
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
3035 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3038 int x, startx = span->startx, endx = span->endx;
3039 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes, add, then pack with unsigned saturation
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3043 pix1 = _mm_add_epi16(pix1, pix2);
3044 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3048 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3049 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3050 pix1 = _mm_add_epi16(pix1, pix2);
3051 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2: out4ub = saturate(ina4ub + inb4ub * tint) per channel, where tint is
// inbtintbgra converted to fixed point scaled by 256. Two pixels per iteration with
// a single-pixel tail. The tint is used in the lane order given (no lane swap here).
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
3056 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3059 int x, startx = span->startx, endx = span->endx;
// tint * 256 as integers, packed so both 64-bit halves hold the same four 16-bit values
3060 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3061 tint = _mm_packs_epi32(tint, tint);
3062 for (x = startx;x+2 <= endx;x+=2)
// inb bytes go into the high byte of each lane so mulhi(tint, inb) = tint*inb/256
3064 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3065 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3066 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3071 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3072 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3073 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3074 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 byte version of the buffer blend: out4ub = ina4ub + (inb4ub - ina4ub) * alpha / 256,
// where alpha is the fourth byte of the inb4ub pixel. Two pixels per iteration with
// a single-pixel tail.
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
3079 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3082 int x, startx = span->startx, endx = span->endx;
3083 for (x = startx;x+2 <= endx;x+=2)
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (fourth channel) of each pixel of pix2 across that pixel's four lanes
3087 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high multiply gives diff*alpha/256
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant: only the low four lanes carry data, so one shuffle suffices
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3095 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3096 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3097 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 byte version of the uniform-color blend:
//   out4ub = in4ub + (color*255 - in4ub) * (color[3]*255) / 256
// i.e. each pixel is moved toward `color` by the color's own fourth component.
// Two pixels per iteration with a single-pixel tail.
// NOTE(review): the guard in front of the single-pixel tail and the braces /
// preprocessor guards are not visible in this listing.
3102 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3105 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 integers with lanes 0 and 2 swapped, then packed so both
// 64-bit halves hold the same four 16-bit values
3106 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3107 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend factor: lane 3 of the color broadcast to every lane, pre-shifted left by 4
3108 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3109 for (x = startx;x+2 <= endx;x+=2)
3111 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// (diff << 4) * (alpha << 4) >> 16 == diff * alpha / 256
3112 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3113 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3117 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3118 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3119 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms/projects the position array with
// the ModelViewProjection matrix uniform, loads the color and texcoord0 arrays
// unchanged, and additionally loads texcoord1 when the SPECULAR permutation bit is set.
3126 void DPSOFTRAST_VertexShader_Generic(void)
3128 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3130 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3131 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3132 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span. With the DIFFUSE permutation the
// first texture is sampled and multiplied by interpolated attribute array 1; with
// SPECULAR a second texture is sampled and combined (multiply / add / mix) depending
// on further permutation bits. The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces and any bare `else` lines are not visible in this listing.
3135 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3137 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3138 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3141 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3142 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// sample the first texture with attribute array 2 as coordinates, then modulate
// by interpolated attribute array 1
3144 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3145 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3146 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3148 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3149 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3152 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests the same COLORMAPPING bit as the `if` above, so the
// AddBuffers branch appears unreachable; one of the two conditions is probably
// meant to test a different permutation bit -- confirm against the reference shader.
3154 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3157 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3162 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured output: interpolated attribute array 1 only (presumably the else-branch
// of the DIFFUSE test; the `else` itself is not visible here -- confirm)
3167 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform and loads texcoord0 and texcoord1 unchanged.
3173 void DPSOFTRAST_VertexShader_PostProcess(void)
3175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3176 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3177 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader for one span: samples the first texture
// into the fragment color, optionally adds bloom from the second texture (BLOOM
// permutation, using the BloomColorSubtract uniform), blends toward the ViewTintColor
// uniform, and finishes the span. The SATURATION and GAMMARAMPS permutations are
// tested but have no implementation yet.
// NOTE(review): braces are not visible in this listing.
3180 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3182 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3183 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3184 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3186 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, coordinates from attribute array 2, written straight to the fragment color
3187 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3188 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture, coordinates from attribute array 3, combined in place
3190 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3191 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3193 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3194 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3196 // TODO: implement saturation
3198 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3200 // TODO: implement gammaramps
3202 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms/projects the position
// array with the ModelViewProjection matrix uniform; no other arrays are loaded.
3207 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3209 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the fragment color bytes for
// pixels [span->startx, span->endx) and finishes the span.
3212 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3214 // this is never called (because colormask is off when this shader is used)
3215 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3216 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3217 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
3218 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3219 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform and transforms texcoord0 by the
// TexMatrix uniform.
3224 void DPSOFTRAST_VertexShader_FlatColor(void)
3226 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3227 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: samples the color texture and
// multiplies it by a constant built from the Color_Ambient uniform (color lanes) and
// the Alpha uniform (fourth lane). Output goes directly to the framebuffer row unless
// alpha test is enabled or the blend mode is not opaque, in which case it goes to a
// temporary buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of color/pix/pix2, the loop-control statements that
// follow each path (skip-ahead after the 4-pixel path, mask test for the single-pixel
// path) and the braces / preprocessor guards are not visible in this listing.
3230 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address offset so that pixel[x*4] addresses span column x
3234 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3235 int x, startx = span->startx, endx = span->endx;
3236 __m128i Color_Ambientm;
3237 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3238 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3240 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3242 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3243 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 as integers with lanes 0 and 2 swapped; lane 3 is cleared and
// replaced with (int)(Alpha * 255); then packed so both halves hold the same 16-bit values
3244 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3245 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3246 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3247 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3248 for (x = startx;x < endx;x++)
// fast path: the next four mask bytes (read as one 32-bit word) are all 1
3251 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go into the high byte of each lane so mulhi gives texel * constant / 256
3254 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3255 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3256 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3257 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3263 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3264 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3265 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finish pass
3267 if (pixel == buffer_FragColorbgra8)
3268 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms/projects the position array
// with the ModelViewProjection matrix uniform, loads the color array unchanged and
// transforms texcoord0 by the TexMatrix uniform.
3274 void DPSOFTRAST_VertexShader_VertexColor(void)
3276 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3278 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span. Per pixel it computes
//   texel * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// in 16-bit fixed point, with the Alpha uniform in the fourth lane of the ambient
// term. Output goes directly to the framebuffer row unless alpha test is enabled or
// the blend mode is not opaque, in which case it goes to a temporary buffer that is
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of data, slope, mod2 and pix2, the loop-control
// statements that follow each path, and the braces / preprocessor guards are not
// visible in this listing.
3281 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3284 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address offset so that pixel[x*4] addresses span column x
3285 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3286 int x, startx = span->startx, endx = span->endx;
3287 __m128i Color_Ambientm, Color_Diffusem;
3289 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3290 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3292 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3293 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3294 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3295 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3296 pixel = buffer_FragColorbgra8;
// ambient term: Color_Ambient * 256, lanes 0/2 swapped, lane 3 replaced by (int)(Alpha * 255)
3297 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3298 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3299 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3300 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse term: Color_Diffuse * 4096, lanes 0/2 swapped, lane 3 cleared
3301 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3302 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3303 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolated vertex color: lanes 0/2 swapped, advanced to startx, scaled by 4096
// (4096 * 4096 >> 16 == 256, so diffuse * color ends up on the same scale as the ambient term)
3304 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3305 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3306 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3307 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3308 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3309 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// the loop increment advances data by one pixel; the 4-pixel path below adds the
// other three steps itself
3310 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3312 __m128i color, mod, pix;
// fast path: the next four mask bytes (read as one 32-bit word) are all 1
3313 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z factors at once; each pixel's color is data * its own z lane
3316 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3317 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3318 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3319 data = _mm_add_ps(data, slope);
3320 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3321 data = _mm_add_ps(data, slope);
3322 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3323 data = _mm_add_ps(data, slope);
3324 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, pixels 0-1 in pix and 2-3 in pix2
3325 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3326 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3327 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3328 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3329 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3335 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3336 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3337 mod = _mm_packs_epi32(mod, mod);
3338 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3339 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finish pass
3341 if (pixel == buffer_FragColorbgra8)
3342 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects the position array with
// the ModelViewProjection matrix uniform, transforms texcoord0 by the TexMatrix
// uniform and loads texcoord4 unchanged.
3348 void DPSOFTRAST_VertexShader_Lightmap(void)
3350 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3351 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3352 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span. Per pixel it computes
//   texel * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, additionally adds Color_Glow * glowtexel. All
// arithmetic is 16-bit fixed point scaled by 256, with the Alpha uniform in the fourth
// lane of the ambient term. Output goes directly to the framebuffer row unless
// alpha test is enabled or the blend mode is not opaque, in which case it goes to a
// temporary buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declaration of pix2, the loop-control statements that follow each
// path, the `else` separating the glow and non-glow loops, and the braces /
// preprocessor guards are not visible in this listing.
3355 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3358 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address offset so that pixel[x*4] addresses span column x
3359 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3360 int x, startx = span->startx, endx = span->endx;
3361 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3362 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3363 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord0, lightmap texture uses texcoord4
3368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3369 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3370 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3371 pixel = buffer_FragColorbgra8;
// ambient term: Color_Ambient * 256, lanes 0/2 swapped, lane 3 replaced by (int)(Alpha * 255)
3372 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3375 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse term: Color_Diffuse * 256, lanes 0/2 swapped, lane 3 cleared
3376 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3377 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3378 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3379 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture (texcoord0) and build the glow constant
3381 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3383 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3384 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path to do
// both multiplies in one instruction
3385 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3386 for (x = startx;x < endx;x++)
3388 __m128i color, lightmap, glow, pix;
// fast path: the next four mask bytes (read as one 32-bit word) are all 1
3389 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3392 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3393 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3394 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel + glowcolor * glowtexel, pixels 0-1 then 2-3
3395 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3396 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3397 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3398 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3399 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3400 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3401 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the upper half of `lightmap` is zero, so the upper half of the
// sum is just the glow constant; after the multiply the upper half (glow term) is
// folded onto the lower half (lit texel) with the shuffle + add
3407 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3411 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3412 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term (presumably the else-branch of the GLOW test -- confirm)
3417 for (x = startx;x < endx;x++)
3419 __m128i color, lightmap, pix;
// fast path: the next four mask bytes (read as one 32-bit word) are all 1
3420 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3423 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3424 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel, pixels 0-1 then 2-3
3425 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3426 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3427 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3429 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3435 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3436 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3437 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3438 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finish pass
3441 if (pixel == buffer_FragColorbgra8)
3442 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake-light shader: only transforms/projects the position
// array with the ModelViewProjection matrix uniform.
3448 void DPSOFTRAST_VertexShader_FakeLight(void)
3450 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake-light shader: no lighting is computed in the visible
// lines; the fragment color bytes for pixels [span->startx, span->endx) are
// zero-filled and the span is finished.
// NOTE(review): lines between the signature and the first declaration are not
// visible in this listing.
3453 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
3459 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3460 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the model-space light-direction-map mode: delegates entirely to
// the lightmap vertex shader.
3465 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3467 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the model-space light-direction-map mode: delegates entirely to
// the lightmap pixel shader with the same thread, triangle and span.
3470 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3472 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the tangent-space light-direction-map mode.
// It has no logic of its own: it forwards to the lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3483 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3485 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3491 void DPSOFTRAST_VertexShader_LightDirection(void)
3494 int numvertices = dpsoftrast.numvertices;
3496 float LightVector[4];
3497 float EyePosition[4];
3498 float EyeVectorModelSpace[4];
3504 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3505 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3506 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3507 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3508 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3509 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3510 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3511 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3512 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3513 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3514 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3515 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3516 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3517 for (i = 0;i < numvertices;i++)
3519 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3520 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3521 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3522 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3523 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3524 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3525 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3526 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3527 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3528 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3529 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3530 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3531 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3532 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3533 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3534 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3535 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3538 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3539 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3540 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3541 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3542 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3543 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3545 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3547 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3549 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3552 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3553 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3554 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3555 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3556 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3557 #define DPSOFTRAST_Vector3Normalize(v)\
3560 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3571 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3573 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3574 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581 int x, startx = span->startx, endx = span->endx;
3582 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3583 float LightVectordata[4];
3584 float LightVectorslope[4];
3585 float EyeVectordata[4];
3586 float EyeVectorslope[4];
3588 float diffusetex[4];
3590 float surfacenormal[4];
3591 float lightnormal[4];
3593 float specularnormal[4];
3596 float SpecularPower;
3598 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3599 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3600 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3601 Color_Glow[3] = 0.0f;
3602 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3603 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3604 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3605 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3606 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3607 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3608 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3609 Color_Pants[3] = 0.0f;
3610 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3611 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3612 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3613 Color_Shirt[3] = 0.0f;
3614 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3615 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3616 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3618 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3619 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3621 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3623 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3625 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3627 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3628 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3629 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3630 Color_Diffuse[3] = 0.0f;
3631 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3632 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3633 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3634 LightColor[3] = 0.0f;
3635 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3636 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3637 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3638 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3639 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3640 Color_Specular[3] = 0.0f;
3641 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3642 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3643 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3644 for (x = startx;x < endx;x++)
3647 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3648 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3649 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3650 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3651 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3653 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3654 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3655 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3656 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3658 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3659 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3660 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3661 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3662 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3663 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3664 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3665 DPSOFTRAST_Vector3Normalize(surfacenormal);
3667 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3668 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3669 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3670 DPSOFTRAST_Vector3Normalize(lightnormal);
3672 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3673 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3674 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3675 DPSOFTRAST_Vector3Normalize(eyenormal);
3677 specularnormal[0] = lightnormal[0] + eyenormal[0];
3678 specularnormal[1] = lightnormal[1] + eyenormal[1];
3679 specularnormal[2] = lightnormal[2] + eyenormal[2];
3680 DPSOFTRAST_Vector3Normalize(specularnormal);
3682 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3683 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3684 specular = pow(specular, SpecularPower * glosstex[3]);
3685 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3687 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3688 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3689 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3690 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3694 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3695 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3696 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3697 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3699 buffer_FragColorbgra8[x*4+0] = d[0];
3700 buffer_FragColorbgra8[x*4+1] = d[1];
3701 buffer_FragColorbgra8[x*4+2] = d[2];
3702 buffer_FragColorbgra8[x*4+3] = d[3];
3705 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3707 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3708 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3709 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3710 Color_Diffuse[3] = 0.0f;
3711 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3712 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3713 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3714 LightColor[3] = 0.0f;
3715 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3716 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3717 for (x = startx;x < endx;x++)
3720 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3721 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3722 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3723 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3724 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3725 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3726 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3727 DPSOFTRAST_Vector3Normalize(surfacenormal);
3729 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3730 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3731 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3732 DPSOFTRAST_Vector3Normalize(lightnormal);
3734 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3735 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3737 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3738 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3739 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3740 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3744 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3745 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3746 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3747 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3749 buffer_FragColorbgra8[x*4+0] = d[0];
3750 buffer_FragColorbgra8[x*4+1] = d[1];
3751 buffer_FragColorbgra8[x*4+2] = d[2];
3752 buffer_FragColorbgra8[x*4+3] = d[3];
3757 for (x = startx;x < endx;x++)
3760 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3761 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3762 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3763 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3765 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3767 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3768 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3769 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3770 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3774 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3775 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3776 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3777 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3779 buffer_FragColorbgra8[x*4+0] = d[0];
3780 buffer_FragColorbgra8[x*4+1] = d[1];
3781 buffer_FragColorbgra8[x*4+2] = d[2];
3782 buffer_FragColorbgra8[x*4+3] = d[3];
3785 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3790 void DPSOFTRAST_VertexShader_LightSource(void)
3793 int numvertices = dpsoftrast.numvertices;
3794 float LightPosition[4];
3795 float LightVector[4];
3796 float LightVectorModelSpace[4];
3797 float EyePosition[4];
3798 float EyeVectorModelSpace[4];
3804 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3805 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3806 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3807 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3808 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3809 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3810 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3811 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3812 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3813 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3814 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3815 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3816 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3817 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3818 for (i = 0;i < numvertices;i++)
3820 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3821 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3822 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3823 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3824 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3825 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3826 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3827 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3828 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3829 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3830 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3831 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3832 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3833 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3834 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3835 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3836 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3837 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3838 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3839 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3842 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3843 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3844 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3845 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3846 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3847 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3848 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3849 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3850 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3851 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3853 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3854 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3857 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3860 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3861 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3862 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3863 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3864 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3865 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3866 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3867 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3868 int x, startx = span->startx, endx = span->endx;
3869 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3870 float CubeVectordata[4];
3871 float CubeVectorslope[4];
3872 float LightVectordata[4];
3873 float LightVectorslope[4];
3874 float EyeVectordata[4];
3875 float EyeVectorslope[4];
3877 float diffusetex[4];
3879 float surfacenormal[4];
3880 float lightnormal[4];
3882 float specularnormal[4];
3885 float SpecularPower;
3886 float CubeVector[4];
3889 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3890 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3891 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3892 Color_Glow[3] = 0.0f;
3893 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3894 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3895 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3896 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3897 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3898 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3899 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3900 Color_Diffuse[3] = 0.0f;
3901 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3902 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3903 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3904 Color_Specular[3] = 0.0f;
3905 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3906 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3907 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3908 Color_Pants[3] = 0.0f;
3909 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3910 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3911 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3912 Color_Shirt[3] = 0.0f;
3913 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3914 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3915 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3916 LightColor[3] = 0.0f;
3917 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3918 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3919 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3920 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3921 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3922 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3923 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3929 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3930 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3931 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3933 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3934 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3935 for (x = startx;x < endx;x++)
3938 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3939 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3940 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3941 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3942 if (attenuation < 0.01f)
3944 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3946 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3947 if (attenuation < 0.01f)
3951 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3955 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3957 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3958 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3959 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3960 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3962 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3963 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3964 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3965 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3966 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3967 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3968 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3969 DPSOFTRAST_Vector3Normalize(surfacenormal);
3971 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3972 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3973 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3974 DPSOFTRAST_Vector3Normalize(lightnormal);
3976 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3977 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3978 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3979 DPSOFTRAST_Vector3Normalize(eyenormal);
3981 specularnormal[0] = lightnormal[0] + eyenormal[0];
3982 specularnormal[1] = lightnormal[1] + eyenormal[1];
3983 specularnormal[2] = lightnormal[2] + eyenormal[2];
3984 DPSOFTRAST_Vector3Normalize(specularnormal);
3986 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3987 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3988 specular = pow(specular, SpecularPower * glosstex[3]);
3989 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3991 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3992 attenuation *= (1.0f / 255.0f);
3993 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3994 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3995 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3996 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4000 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4001 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4002 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4003 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4005 buffer_FragColorbgra8[x*4+0] = d[0];
4006 buffer_FragColorbgra8[x*4+1] = d[1];
4007 buffer_FragColorbgra8[x*4+2] = d[2];
4008 buffer_FragColorbgra8[x*4+3] = d[3];
4011 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4013 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4014 for (x = startx;x < endx;x++)
4017 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4018 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4019 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4020 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4021 if (attenuation < 0.01f)
4023 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4025 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4026 if (attenuation < 0.01f)
4030 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4031 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4032 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4033 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4034 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4036 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4037 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4038 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4039 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4041 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4042 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4043 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4044 DPSOFTRAST_Vector3Normalize(surfacenormal);
4046 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4047 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4048 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4049 DPSOFTRAST_Vector3Normalize(lightnormal);
4051 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4052 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4054 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4055 attenuation *= (1.0f / 255.0f);
4056 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4057 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4058 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4059 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4063 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4064 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4065 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4066 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4068 buffer_FragColorbgra8[x*4+0] = d[0];
4069 buffer_FragColorbgra8[x*4+1] = d[1];
4070 buffer_FragColorbgra8[x*4+2] = d[2];
4071 buffer_FragColorbgra8[x*4+3] = d[3];
4076 for (x = startx;x < endx;x++)
4079 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4080 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4081 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4082 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4083 if (attenuation < 0.01f)
4085 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4087 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4088 if (attenuation < 0.01f)
4092 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4093 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4094 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4095 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4096 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4098 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4099 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4100 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4101 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4103 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4105 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4106 attenuation *= (1.0f / 255.0f);
4107 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4108 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4109 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4110 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4114 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4115 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4116 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4117 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4119 buffer_FragColorbgra8[x*4+0] = d[0];
4120 buffer_FragColorbgra8[x*4+1] = d[1];
4121 buffer_FragColorbgra8[x*4+2] = d[2];
4122 buffer_FragColorbgra8[x*4+3] = d[3];
4125 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the Refraction entry of DPSOFTRAST_ShaderModeTable.
// It calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both
// array arguments, using the uniform4f data located at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4131 void DPSOFTRAST_VertexShader_Refraction(void)
4133 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the Refraction entry of DPSOFTRAST_ShaderModeTable.
// The visible code computes no refraction: it calls DPSOFTRAST_Draw_Span_Begin with
// buffer_z, zero-fills the span's pixels in a local 4-bytes-per-pixel fragment buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span helpers; only
//                            span->startx and span->endx are read directly here.
4136 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4139 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4140 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4141 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are cleared; the rest of the buffer is left uninitialized
4142 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4143 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the Water entry of DPSOFTRAST_ShaderModeTable.
// It calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both
// array arguments, using the uniform4f data located at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4148 void DPSOFTRAST_VertexShader_Water(void)
4150 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the Water entry of DPSOFTRAST_ShaderModeTable.
// The visible code computes no water effect: it calls DPSOFTRAST_Draw_Span_Begin with
// buffer_z, zero-fills the span's pixels in a local 4-bytes-per-pixel fragment buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span helpers; only
//                            span->startx and span->endx are read directly here.
4154 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4157 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4158 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are cleared; the rest of the buffer is left uninitialized
4160 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4161 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the ShowDepth entry of DPSOFTRAST_ShaderModeTable.
// It calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both
// array arguments, using the uniform4f data located at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4166 void DPSOFTRAST_VertexShader_ShowDepth(void)
4168 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the ShowDepth entry of DPSOFTRAST_ShaderModeTable.
// The visible code does not visualize depth: it calls DPSOFTRAST_Draw_Span_Begin with
// buffer_z, zero-fills the span's pixels in a local 4-bytes-per-pixel fragment buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span helpers; only
//                            span->startx and span->endx are read directly here.
4171 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4174 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4175 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4176 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are cleared; the rest of the buffer is left uninitialized
4177 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4178 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the DeferredGeometry entry of DPSOFTRAST_ShaderModeTable.
// It calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both
// array arguments, using the uniform4f data located at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4183 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4185 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the DeferredGeometry entry of DPSOFTRAST_ShaderModeTable.
// The visible code writes no geometry data: it calls DPSOFTRAST_Draw_Span_Begin with
// buffer_z, zero-fills the span's pixels in a local 4-bytes-per-pixel fragment buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span helpers; only
//                            span->startx and span->endx are read directly here.
4188 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4191 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4192 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are cleared; the rest of the buffer is left uninitialized
4194 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4195 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage registered for the DeferredLightSource entry of DPSOFTRAST_ShaderModeTable.
// It calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION for both
// array arguments, using the uniform4f data located at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 (4 floats per uniform slot).
4200 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4202 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage registered for the DeferredLightSource entry of DPSOFTRAST_ShaderModeTable.
// The visible code computes no lighting: it calls DPSOFTRAST_Draw_Span_Begin with
// buffer_z, zero-fills the span's pixels in a local 4-bytes-per-pixel fragment buffer
// and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread, triangle, span - forwarded unchanged to the span helpers; only
//                            span->startx and span->endx are read directly here.
4205 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4208 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4209 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4210 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are cleared; the rest of the buffer is left uninitialized
4211 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4212 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode description used by the draw path: which functions to run and
// which vertex arrays / texture units the mode consumes.
// NOTE(review): the table initializers start with an integer and the draw code reads
// a member named lodarrayindex, so a leading member is presumably declared on a line
// that is not visible here - confirm against the full file.
4217 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage; takes no arguments
4220 void (*Vertex)(void);
// per-span pixel stage, invoked as Span(thread, triangle, span) by the span processor
4221 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of vertex array indices used by the mode; consumers test each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 (255 as unsigned char) after the last used one
4222 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of texture unit indices used by the mode; consumers test each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 after the last used one
4223 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4225 DPSOFTRAST_ShaderModeInfo;
// Table of shader mode descriptions, indexed by shader mode (the draw code indexes it
// with thread->shader_mode and dpsoftrast.shader_mode). It has SHADERMODE_COUNT slots.
// Each entry lists, in order: an integer (2 for every mode), the vertex function, the
// span function, the vertex arrays the mode reads, and the texture units it samples.
// Both lists end with ~0, which becomes 255 when stored in an unsigned char.
// NOTE(review): the entry order presumably follows the SHADERMODE_* enumeration, which
// is declared outside this file section - confirm before reordering.
4227 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4229 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4230 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4231 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4232 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4233 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4234 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4235 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4236 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4237 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4238 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4239 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// NOTE(review): the five entries below give no initializer for the texture unit list,
// so C zero-initializes it: every slot names unit 0 and none holds the ~0 end marker
// that the other entries use. Confirm this is intended for these placeholder modes.
4240 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4241 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4242 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4243 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4244 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued in thread->spans[0 .. numspans-1] and then resets
// thread->numspans to 0.
// For each span it looks up the owning triangle and takes one of two paths:
//  - depth testing enabled and a depth buffer present: build a per-pixel pass mask
//    from the depth function, trim the span to the passing range, run the shader
//    mode's Span function (only if a color buffer exists and the color mask is set),
//    then write depth where thread->depthmask is set;
//  - otherwise: mark every pixel of the span as passing and run the Span function
//    (again only if a color buffer exists and the color mask is set).
// Several short lines of this function (local declarations, braces, the bodies of the
// trimming loops and of the depth-write loop) are not visible in this section.
4247 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4254 // unsigned int *colorpixel;
4255 unsigned int *depthpixel;
4261 DPSOFTRAST_State_Triangle *triangle;
4262 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the current span, indexed by x within the span;
// span->pixelmask is pointed at this local array, so it is only valid during this call
4263 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4264 for (i = 0; i < thread->numspans; i++)
4266 span = &thread->spans[i];
4267 triangle = &thread->triangles[span->triangle];
4268 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[2] + x*w[0] + y*w[1]) at the span origin
4270 wslope = triangle->w[0];
4271 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert w and its per-pixel step to integer depth units; the polygon offset terms
// are subtracted from the starting depth only
4272 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4273 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer relative to the span origin
4274 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4275 startx = span->startx;
// each case fills pixelmask[startx..endx-1] by comparing the stored depth (left operand)
// with the interpolated depth d (right operand)
// NOTE(review): d is derived from w, so presumably a larger d means nearer - confirm
4277 switch(thread->fb_depthfunc)
4280 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4281 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4282 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4283 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4284 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4285 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4286 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4288 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4289 //for (x = startx;x < endx;x++)
4290 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4291 // if there is no color buffer, skip pixel shader
// the two loops below scan for failed pixels at the left and right ends of the span
// (their bodies are on lines not shown; presumably they shrink startx/endx)
4292 while (startx < endx && !pixelmask[startx])
4294 while (endx > startx && !pixelmask[endx-1])
4297 continue; // no pixels to fill
4298 span->pixelmask = pixelmask;
4299 span->startx = startx;
4301 // run pixel shader if appropriate
4302 // do this before running depthmask code, to allow the pixelshader
4303 // to clear pixelmask values for alpha testing
4304 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4305 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes walk the trimmed range with the same interpolated depth d
// (the per-pixel store is on lines not shown)
4306 if (thread->depthmask)
4307 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4313 // no depth testing means we're just dealing with color...
4314 // if there is no color buffer, skip pixel shader
4315 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes
4317 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4318 span->pixelmask = pixelmask;
4319 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue has been consumed
4323 thread->numspans = 0;
// Declares the Draw command through the DEFCOMMAND macro, which is defined outside
// this section. The arguments are the number 22, the name Draw, and the payload fields.
// NOTE(review): presumably this produces DPSOFTRAST_Command_Draw and
// DPSOFTRAST_OPCODE_Draw, both used by the draw code below, and the macro presumably
// adds the commandsize member that the code reads - confirm against the macro.
// Payload fields as used by the visible code:
//   starty/endy  - row range compared against each thread's bands
//   refcount     - atomic counter decremented once per interpreting thread
//   clipped, firstvertex, numvertices, numtriangles - copied into locals by the interpreter
//   arrays       - vertex data block: screen coordinates first, transformed arrays after
//   element3i/element3s - 32-bit / 16-bit triangle index lists
4326 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of one rasterizer thread.
//   thread  - per-thread state: scissor, viewport, row bands (miny1..maxy1 and
//             miny2..maxy2), bound textures and the triangle/span queues that are filled here
//   command - the draw command: vertex data block, index lists, counts and a shared refcount
// Steps, per triangle: fetch indices, clip against the near plane (up to 4 resulting
// points in screen[]), reject off-screen triangles, compute screen-space plane equations
// for w and for every vertex array used by the current shader mode, choose a mip level
// per texture unit, then walk the edges row by row and queue spans of at most
// DPSOFTRAST_DRAW_MAXSPANLENGTH pixels. Queues are flushed with
// DPSOFTRAST_Draw_ProcessSpans when they fill up and once more at the end.
// Each thread drops one reference on command->refcount. When the count reaches zero and
// commandsize shows no data stored inline after the command, command->arrays is freed
// with MM_FREE (the allocator uses MM_CALLOC for such oversized draws).
// Many short lines (braces, else, break/continue, small declarations and assignments,
// switch headers) are not visible in this section; comments that depend on them are hedged.
4328 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4331 int cullface = thread->cullface;
4332 int minx, maxx, miny, maxy;
4333 int miny1, maxy1, miny2, maxy2;
4334 __m128i fbmin, fbmax;
4335 __m128 viewportcenter, viewportscale;
4336 int firstvertex = command->firstvertex;
4337 int numvertices = command->numvertices;
4338 int numtriangles = command->numtriangles;
4339 const int *element3i = command->element3i;
4340 const unsigned short *element3s = command->element3s;
4341 int clipped = command->clipped;
4348 int starty, endy, bandy;
4352 __m128 triangleedge1, triangleedge2, trianglenormal;
4355 DPSOFTRAST_State_Triangle *triangle;
4356 DPSOFTRAST_Texture *texture;
4357 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor extent; the thread's two row bands are passed through bound() with
// that extent as limits
4358 miny = thread->fb_scissor[1];
4359 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4360 miny1 = bound(miny, thread->miny1, maxy);
4361 maxy1 = bound(miny, thread->maxy1, maxy);
4362 miny2 = bound(miny, thread->miny2, maxy);
4363 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range touches neither band of this thread: only drop this thread's
// reference (presumably followed by a return on a line not shown)
4364 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4366 if (!ATOMIC_DECREMENT(command->refcount))
4368 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4369 MM_FREE(command->arrays);
// horizontal scissor extent
4373 minx = thread->fb_scissor[0];
4374 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp limits as packed 16-bit (x, y) pairs: lower limit (minx, miny1), inclusive upper
// limit (maxx-1, maxy2-1)
4375 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4376 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4377 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4378 viewportscale = _mm_load_ps(thread->fb_viewportscale);
// screen[3] and clipfrac[] are only written by some clip cases, so start them at zero
4379 screen[3] = _mm_setzero_ps();
4380 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4381 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float4s; the transformed position
// float4s follow immediately (layout established by DPSOFTRAST_Draw_AllocateDrawCommand)
4383 const float *screencoord4f = command->arrays;
4384 const float *arrays = screencoord4f + numvertices*4;
4386 // generate the 3 edges of this triangle
4387 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three vertex indices from the 16-bit or the 32-bit index list (the selecting
// condition is not shown) and rebase them to the first vertex of this draw
4390 e[0] = element3s[i*3+0] - firstvertex;
4391 e[1] = element3s[i*3+1] - firstvertex;
4392 e[2] = element3s[i*3+2] - firstvertex;
4396 e[0] = element3i[i*3+0] - firstvertex;
4397 e[1] = element3i[i*3+1] - firstvertex;
4398 e[2] = element3i[i*3+2] - firstvertex;
// Local helper macros used by the clipping code below:
//  SKIPBACKFACE            - builds two screen-space edges from screen[1] and puts
//                            edge1.x*edge2.y - edge1.y*edge2.x in the low lane of
//                            trianglenormal, then tests its sign against zero; the
//                            statements guarded by those tests and the way cullface
//                            selects between them are on lines not shown.
//  CLIPPEDVERTEXLERP(k,p1,p2) - stores clipdist[p1]/(clipdist[p1]-clipdist[p2]) in
//                            clipfrac[p1], interpolates the position between vertices
//                            e[p1] and e[p2] by it and projects the result into screen[k].
//  CLIPPEDVERTEXCOPY(k,p1) - loads the precomputed screen coordinate of vertex e[p1]
//                            into screen[k].
//  GENATTRIBS(a0,a1,a2)    - per numbered case, fills three attribute values with copy
//                            or interpolate combinations (the switch selector is on a
//                            line not shown; presumably it is the clip case chosen below).
4407 #define SKIPBACKFACE \
4408 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4409 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4410 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4411 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4412 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4416 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4420 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4425 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4426 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4428 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4429 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4431 #define CLIPPEDVERTEXCOPY(k,p1) \
4432 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// attribute fetch: plain copy of vertex e[p1] from the array currently addressed by arrays
4434 #define GENATTRIBCOPY(attrib, p1) \
4435 attrib = _mm_load_ps(&arrays[e[p1]*4]);
// attribute fetch: interpolate between e[p1] and e[p2] with the fraction stored by CLIPPEDVERTEXLERP
4436 #define GENATTRIBLERP(attrib, p1, p2) \
4438 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4439 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4441 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4445 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4446 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4447 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4448 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4449 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4450 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4451 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4457 // calculate distance from nearplane
// arrays still addresses the transformed positions here, so each distance is z + w
4458 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4459 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4460 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the triangle by which vertices have clipdist >= 0 and build the clipped
// polygon in screen[]: vertices in front are copied, crossings are interpolated; the
// result has 3 or 4 points (bookkeeping such as the point count is on lines not shown)
4461 if (clipdist[0] >= 0.0f)
4463 if (clipdist[1] >= 0.0f)
4465 if (clipdist[2] >= 0.0f)
4468 // triangle is entirely in front of nearplane
4469 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4476 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4484 if (clipdist[2] >= 0.0f)
4486 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4493 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4500 else if (clipdist[1] >= 0.0f)
4502 if (clipdist[2] >= 0.0f)
4504 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4511 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4517 else if (clipdist[2] >= 0.0f)
4519 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4524 else continue; // triangle is entirely behind nearplane
4527 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as signed 16-bit values; a
// triangle repeats point 2 in the fourth slot. The min/max reductions then yield the
// bounding box, which is clamped to fbmin/fbmax.
4528 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4529 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4530 screenmin = _mm_min_epi16(screeni, screenir),
4531 screenmax = _mm_max_epi16(screeni, screenir);
4532 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4533 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4534 screenmin = _mm_max_epi16(screenmin, fbmin);
4535 screenmax = _mm_min_epi16(screenmax, fbmax);
4536 // skip offscreen triangles
4537 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range covered by the clamped bounding box (endy is exclusive)
4539 starty = _mm_extract_epi16(screenmin, 1);
4540 endy = _mm_extract_epi16(screenmax, 1)+1;
// the rows lie entirely in the gap between this thread's two bands
4541 if (starty >= maxy1 && endy <= miny2)
// per-point integer y, sign-extended to 32 bits
4543 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4546 triangle = &thread->triangles[thread->numtriangles];
4548 // calculate attribute plans for triangle data...
4549 // okay, this triangle is going to produce spans, we'd better project
4550 // the interpolants now (this is what gives perspective texturing),
4551 // this consists of simply multiplying all arrays by the W coord
4552 // (which is basically 1/Z), which will be undone per-pixel
4553 // (multiplying by Z again) to get the perspective-correct array
4556 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4557 __m128 mipedgescale, mipdensity;
// edge components divided by the low lane of trianglenormal, then broadcast one per register:
// attribuvslope = (edge1.x, edge1.y, edge2.x, edge2.y) / normal
4558 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4559 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4560 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4561 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4562 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of the first three screen points, broadcast to all lanes
4563 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4564 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4565 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w, anchored at point 1: w(x, y) = w[2] + x*w[0] + y*w[1]
// (DPSOFTRAST_Draw_ProcessSpans evaluates it in exactly this form)
4566 attribedge1 = _mm_sub_ss(w0, w1);
4567 attribedge2 = _mm_sub_ss(w2, w1);
4568 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4569 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4570 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4571 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4572 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4573 _mm_store_ss(&triangle->w[0], attribxslope);
4574 _mm_store_ss(&triangle->w[1], attribyslope);
4575 _mm_store_ss(&triangle->w[2], attriborigin);
4576 mipedgescale = _mm_setzero_ps();
// same plane setup for every vertex array the shader mode lists, on all four components
// and with each vertex value pre-multiplied by its w
4577 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4579 __m128 attrib0, attrib1, attrib2;
4580 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// the table stores ~0 after the last used array (the statement taken here is not shown)
4581 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array in the command's vertex data block
4583 arrays += numvertices*4;
4584 GENATTRIBS(attrib0, attrib1, attrib2);
4585 attriborigin = _mm_mul_ps(attrib1, w1);
4586 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4587 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4588 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4589 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4590 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored as x slope, y slope, origin using non-temporal stores
4591 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4592 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4593 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives mip selection: change of its first two components along
// each of the two edges, divided by that edge's approximate screen length (rsqrt)
4594 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4596 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4597 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4598 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4599 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit; units that are not processed below keep level 0
4603 memset(triangle->mip, 0, sizeof(triangle->mip));
4604 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4606 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// the table stores ~0 after the last used unit (the statement taken here is not shown)
4607 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4609 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a level computed
4610 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from mipmap[0][2..3]
// NOTE(review): presumably the base level's width and height - confirm
4612 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
// squared length per edge, then the smaller of the two edges
4613 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4614 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4615 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4616 // this will be multiplied in the texturing routine by the texture resolution
4617 y = _mm_cvtss_si32(mipdensity);
// log2 of the square root of the density, clamped to the texture's last mip level
// (a guard around this computation may sit on the lines not shown)
4620 y = (int)(log((float)y)*0.5f/M_LN2);
4621 if (y > texture->mipmaps - 1)
4622 y = texture->mipmaps - 1;
4623 triangle->mip[texunit] = y;
// walk the rows in two passes, one per band: first up to maxy1, then from miny2 up to maxy2
4629 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4632 __m128 xcoords, xslope;
// per point: has the current row passed the point's y? The 16-bit byte mask identifies
// which polygon edges are active on this row (the switch headers are not shown; the first
// case list handles four points, the second three).
4633 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4634 int yccmask = _mm_movemask_epi8(ycc);
4635 int edge0p, edge0n, edge1p, edge1n;
4642 case 0xFFFF: /*0000*/ y = endy; continue;
4643 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4644 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4645 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4646 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4647 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4648 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4649 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4650 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4651 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4652 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4653 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4654 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4655 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4656 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4657 case 0x0000: /*1111*/ y++; continue;
4665 case 0xFFFF: /*000*/ y = endy; continue;
4666 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4667 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4668 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4669 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4670 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4671 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4672 case 0x0000: /*111*/ y++; continue;
// nexty: points already passed become a large value (0x7FFF) so the minimum across lanes
// is effectively the lowest y among points not yet passed; the same edge pair is used up
// to that row, limited to the current band
4675 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4676 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4677 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4678 nexty = _mm_extract_epi16(ycc, 0);
4679 if (nexty >= bandy) nexty = bandy-1;
// lanes 0/2 become dx/dy for edge 0 / edge 1 (lanes 1/3 become 1); xcoords holds each
// edge's x on row y in lanes 0 and 2, plus 0.5
4680 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4681 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4682 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4683 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4684 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in lane 0: swap the two edges if edge 0 is to the right on this row
4685 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4687 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4688 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// one iteration per row; both edge positions advance by their slope
4690 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4692 int startx, endx, offset;
4693 startx = _mm_cvtss_si32(xcoords);
4694 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// move startx toward minx in whole DPSOFTRAST_DRAW_MAXSPANLENGTH steps (the condition
// enclosing these two statements is on a line not shown)
4697 if (startx < 0) startx = 0;
4698 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4700 if (endx > maxx) endx = maxx;
4701 if (startx >= endx) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; a span's
// startx/endx are relative to the chunk start (offset) and respect the scissor's minx
4702 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4704 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4705 span->triangle = thread->numtriangles;
4708 span->startx = max(minx - offset, 0);
4709 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4710 if (span->startx >= span->endx)
// flush when the span queue is full
4712 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4713 DPSOFTRAST_Draw_ProcessSpans(thread);
// commit the triangle slot; flush and restart the triangle queue when it is full
4718 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4720 DPSOFTRAST_Draw_ProcessSpans(thread);
4721 thread->numtriangles = 0;
// drop this thread's reference; the thread that takes it to zero frees command->arrays
// when commandsize indicates no data stored inline after the command
4725 if (!ATOMIC_DECREMENT(command->refcount))
4727 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4728 MM_FREE(command->arrays);
// flush whatever is still queued
4731 if (thread->numspans > 0 || thread->numtriangles > 0)
4733 DPSOFTRAST_Draw_ProcessSpans(thread);
4734 thread->numtriangles = 0;
// Reserves a Draw command in the command pool together with storage for the
// per-vertex float[4] arrays and a private copy of the triangle indices.
//
// firstvertex, numvertices and numtriangles are stored in the command.
// element3s / element3i are the 16-bit / 32-bit index lists that get copied.
// As a side effect dpsoftrast.firstvertex, dpsoftrast.numvertices,
// dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] are repointed at the
// new storage.
//
// NOTE(review): several control-flow lines of this function are not visible in
// this view; remarks about which branch runs when are therefore hedged.
4739 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
// Size of the fixed command header, rounded with DPSOFTRAST_ALIGNCOMMAND.
4743 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// Two float[4] per vertex up front: screencoord4f and
// post_array4f[DPSOFTRAST_ARRAY_POSITION], both carved out further down.
// NOTE(review): sizes are accumulated in a plain int; very large vertex counts
// could overflow — confirm callers bound numvertices/numtriangles.
4744 int datasize = 2*numvertices*sizeof(float[4]);
4745 DPSOFTRAST_Command_Draw *command;
4746 unsigned char *data;
// Walk the array list of the current shader mode and add one more
// float[4]-per-vertex array to the payload size.
4747 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4749 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// NOTE(review): what happens for an out-of-range entry is on a line not shown
// here — presumably the entry is skipped or terminates the list; confirm.
4750 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4752 datasize += numvertices*sizeof(float[4]);
// Room for the index copy. Presumably only one of these two additions runs,
// selected by which of element3s / element3i was supplied — the selecting
// conditions are not visible here.
4755 datasize += numtriangles*sizeof(unsigned short[3]);
4757 datasize += numtriangles*sizeof(int[3]);
4758 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// Payload too large for a single pool entry: queue only the header and take
// the payload from MM_CALLOC. Code that retires the command recognises this
// case by commandsize <= aligned header size and releases arrays with MM_FREE.
// NOTE(review): the MM_CALLOC result is used without a NULL check.
4759 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4761 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4762 data = (unsigned char *)MM_CALLOC(datasize, 1);
// Presumably the alternative branch: header and payload share one pool
// allocation, with the payload starting right after the aligned header.
4766 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4767 data = (unsigned char *)command + commandsize;
4769 command->firstvertex = firstvertex;
4770 command->numvertices = numvertices;
4771 command->numtriangles = numtriangles;
// arrays always points at the start of the payload, wherever it lives.
4772 command->arrays = (float *)data;
// Forget array pointers from the previous draw before handing out new ones.
4773 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4774 dpsoftrast.firstvertex = firstvertex;
4775 dpsoftrast.numvertices = numvertices;
// Carve the payload: screen coordinates first, then positions, then one array
// per entry of the shader mode's array list, then the index copy.
4776 dpsoftrast.screencoord4f = (float *)data;
4777 data += numvertices*sizeof(float[4]);
4778 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4779 data += numvertices*sizeof(float[4]);
4780 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4782 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// Must mirror the handling in the sizing loop above so that the carved layout
// matches the size that was reserved.
4783 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4785 dpsoftrast.post_array4f[j] = (float *)data;
4786 data += numvertices*sizeof(float[4]);
// Start with no indices, then copy whichever index list applies into the tail
// of the payload (the selecting conditions are not visible here).
4788 command->element3i = NULL;
4789 command->element3s = NULL;
4792 command->element3s = (unsigned short *)data;
4793 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4797 command->element3i = (int *)data;
4798 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one indexed triangle batch for rasterization.
//
// firstvertex/numvertices select the vertex range, numtriangles is the number
// of triangles, and element3i / element3s are the 32-bit / 16-bit index lists
// (passed straight through to DPSOFTRAST_Draw_AllocateDrawCommand).
4803 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4805 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// Run the vertex stage of the current shader mode on the calling thread.
// Presumably this fills the arrays set up by the allocation and updates
// dpsoftrast.drawstarty/drawendy/drawclipped, which are read below — confirm.
4806 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// Restrict the covered scanline range to the framebuffer height.
4807 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4808 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// Nothing to draw: release an out-of-pool payload (signalled by a
// header-only commandsize) and give the pool space back.
4809 if (command->starty >= command->endy)
4811 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4812 MM_FREE(command->arrays);
4813 DPSOFTRAST_UndoCommand(command->commandsize);
4816 command->clipped = dpsoftrast.drawclipped;
// One reference per thread; each thread decrements it after processing the
// draw and the last one releases an out-of-pool payload.
4817 command->refcount = dpsoftrast.numthreads;
// Threaded mode: publish the command and wake only those threads that are
// idle (starving) and whose scanline bands overlap [starty, endy).
4819 if (dpsoftrast.usethreads)
4822 DPSOFTRAST_Draw_SyncCommands();
4823 for (i = 0; i < dpsoftrast.numthreads; i++)
4825 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4826 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4827 Thread_CondSignal(thread->drawcond);
// Presumably the non-threaded branch: execute the queued commands right away.
4832 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands for one thread, starting at thread->commandoffset
// and stopping once the read position reaches endoffset. The command pool is
// treated as a ring: the position wraps to 0 at DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
// On exit thread->commandoffset holds the new read position.
4836 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4838 int commandoffset = thread->commandoffset;
4839 while (commandoffset != endoffset)
4841 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4842 switch (command->opcode)
// INTERPCOMMAND(name) generates the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name>, and wrap at the end of the pool.
4844 #define INTERPCOMMAND(name) \
4845 case DPSOFTRAST_OPCODE_##name : \
4846 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4847 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4848 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4849 commandoffset = 0; \
4851 INTERPCOMMAND(Viewport)
4852 INTERPCOMMAND(ClearColor)
4853 INTERPCOMMAND(ClearDepth)
4854 INTERPCOMMAND(ColorMask)
4855 INTERPCOMMAND(DepthTest)
4856 INTERPCOMMAND(ScissorTest)
4857 INTERPCOMMAND(Scissor)
4858 INTERPCOMMAND(BlendFunc)
4859 INTERPCOMMAND(BlendSubtract)
4860 INTERPCOMMAND(DepthMask)
4861 INTERPCOMMAND(DepthFunc)
4862 INTERPCOMMAND(DepthRange)
4863 INTERPCOMMAND(PolygonOffset)
4864 INTERPCOMMAND(CullFace)
4865 INTERPCOMMAND(AlphaTest)
4866 INTERPCOMMAND(AlphaFunc)
4867 INTERPCOMMAND(SetTexture)
4868 INTERPCOMMAND(SetShader)
4869 INTERPCOMMAND(Uniform4f)
4870 INTERPCOMMAND(UniformMatrix4f)
4871 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized (the payload may be inline), so the step is
// the size recorded in the command rather than sizeof the struct.
4873 case DPSOFTRAST_OPCODE_Draw:
4874 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4875 commandoffset += command->commandsize;
4876 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// Publish progress immediately after a draw instead of waiting for the loop to
// finish; presumably so the producer can observe it sooner — confirm.
4878 thread->commandoffset = commandoffset;
// NOTE(review): the handling of Reset is on lines not visible here; presumably
// it rewinds the read position to the start of the pool — confirm.
4881 case DPSOFTRAST_OPCODE_Reset:
// Final publish of the read position once endoffset has been reached.
4886 thread->commandoffset = commandoffset;
// Worker thread entry point. data is the thread's DPSOFTRAST_State_Thread.
// The loop runs for as long as thread->index is non-negative: it interprets
// any commands queued up to dpsoftrast.drawcommand and then sleeps on drawcond
// until it is signalled again.
4889 static int DPSOFTRAST_Draw_Thread(void *data)
4891 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4892 while(thread->index >= 0)
// Unlocked fast path: there is work queued, so process it right away.
4894 if (thread->commandoffset != dpsoftrast.drawcommand)
4896 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Re-check under the mutex before sleeping so that a command or shutdown
// request published while we were working is not missed.
4900 Thread_LockMutex(thread->drawmutex);
4901 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Caught up: release a flusher that is blocked waiting for this thread.
4903 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving tells producers that this thread needs a drawcond signal to resume.
// A wakeup without new work is harmless: the outer loop re-evaluates its
// conditions and comes back here.
4904 thread->starving = true;
4905 Thread_CondWait(thread->drawcond, thread->drawmutex);
4906 thread->starving = false;
4908 Thread_UnlockMutex(thread->drawmutex);
// Drains the command queue: publishes pending commands, makes sure every
// thread has consumed them up to dpsoftrast.drawcommand, then marks the
// command pool as empty.
4914 static void DPSOFTRAST_Draw_FlushThreads(void)
4916 DPSOFTRAST_State_Thread *thread;
4918 DPSOFTRAST_Draw_SyncCommands();
4919 if (dpsoftrast.usethreads)
// Pass 1: wake every worker that is behind and currently asleep. The check is
// repeated under the mutex so the starving flag is read consistently.
4921 for (i = 0; i < dpsoftrast.numthreads; i++)
4923 thread = &dpsoftrast.threads[i];
4924 if (thread->commandoffset != dpsoftrast.drawcommand)
4926 Thread_LockMutex(thread->drawmutex);
4927 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4928 Thread_CondSignal(thread->drawcond);
4929 Thread_UnlockMutex(thread->drawmutex);
// Pass 2: block on each worker that is still behind. The worker signals
// waitcond when it has caught up and sees waiting set.
4932 for (i = 0; i < dpsoftrast.numthreads; i++)
4934 thread = &dpsoftrast.threads[i];
4935 if (thread->commandoffset != dpsoftrast.drawcommand)
4937 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait below is guarded by a single `if`, not a predicate
// loop. If Thread_CondWait can return spuriously (as POSIX condition variables
// may), the flush could continue while the worker is still behind — consider
// re-checking the condition in a loop.
4938 if (thread->commandoffset != dpsoftrast.drawcommand)
4940 thread->waiting = true;
4941 Thread_CondWait(thread->waitcond, thread->drawmutex);
4942 thread->waiting = false;
4944 Thread_UnlockMutex(thread->drawmutex);
// Presumably the non-threaded branch: run the outstanding commands for each
// thread state directly on the calling thread.
4950 for (i = 0; i < dpsoftrast.numthreads; i++)
4952 thread = &dpsoftrast.threads[i];
4953 if (thread->commandoffset != dpsoftrast.drawcommand)
4954 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Everything has been consumed, so the whole pool can be reused.
4957 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper around DPSOFTRAST_Draw_FlushThreads(): returns once all
// queued commands have been consumed by every thread state.
4960 void DPSOFTRAST_Flush(void)
4962 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): the body is not visible in this view; presumably it waits for
// all queued rendering to complete in the same way as DPSOFTRAST_Flush — confirm.
4965 void DPSOFTRAST_Finish(void)
// Initializes the software rasterizer.
//
// width, height  framebuffer dimensions in pixels
// numthreads     requested worker count; threading is used only when this is
//                positive and Thread_HasThreads() reports support, and the
//                count is then bounded to 1..64 (otherwise a single state is used)
// interlace      bounded to 0..1; only honoured in threaded mode
// colorpixels    storage for color target 0
// depthpixels    storage for the depth buffer
//
// All global state is cleared first, then one DPSOFTRAST_State_Thread per
// thread is allocated, given default render state and a share of the
// scanlines, and (in threaded mode) started.
4970 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4980 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// Byte-order probe; u is declared on lines not visible here (presumably an
// int/byte union preset to 1 — confirm).
4981 dpsoftrast.bigendian = u.b[3];
4982 dpsoftrast.fb_width = width;
4983 dpsoftrast.fb_height = height;
4984 dpsoftrast.fb_depthpixels = depthpixels;
// Only color target 0 is supplied; the remaining targets start out unbound.
// The original cleared slot 1 three times (copy-paste slip); slots 2 and 3 are
// now named explicitly. The values are unchanged because of the memset above.
// NOTE(review): assumes fb_colorpixels has four slots, as the four assignments
// imply — confirm against the declaration.
4985 dpsoftrast.fb_colorpixels[0] = colorpixels;
4986 dpsoftrast.fb_colorpixels[1] = NULL;
4987 dpsoftrast.fb_colorpixels[2] = NULL;
4988 dpsoftrast.fb_colorpixels[3] = NULL;
// Default viewport covers the whole framebuffer.
4989 dpsoftrast.viewport[0] = 0;
4990 dpsoftrast.viewport[1] = 0;
4991 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4992 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4993 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture numbering starts at 1 while no texture storage exists yet
// (texture_max is 0 and the texture pointer is still zero from the memset).
4994 dpsoftrast.texture_firstfree = 1;
4995 dpsoftrast.texture_end = 1;
4996 dpsoftrast.texture_max = 0;
4997 dpsoftrast.color[0] = 1;
4998 dpsoftrast.color[1] = 1;
4999 dpsoftrast.color[2] = 1;
5000 dpsoftrast.color[3] = 1;
5001 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5002 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5003 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5004 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5005 for (i = 0; i < dpsoftrast.numthreads; i++)
5007 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// Default render state for this thread.
5009 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned among the visible lines; unless
// it is set elsewhere it keeps whatever MM_CALLOC left there — confirm that
// this is intended.
5010 thread->colormask[1] = 1;
5011 thread->colormask[2] = 1;
5012 thread->colormask[3] = 1;
5013 thread->blendfunc[0] = GL_ONE;
5014 thread->blendfunc[1] = GL_ZERO;
5015 thread->depthmask = true;
5016 thread->depthtest = true;
5017 thread->depthfunc = GL_LEQUAL;
5018 thread->scissortest = false;
5019 thread->alphatest = false;
5020 thread->alphafunc = GL_GREATER;
5021 thread->alphavalue = 0.5f;
5022 thread->viewport[0] = 0;
5023 thread->viewport[1] = 0;
5024 thread->viewport[2] = dpsoftrast.fb_width;
5025 thread->viewport[3] = dpsoftrast.fb_height;
5026 thread->scissor[0] = 0;
5027 thread->scissor[1] = 0;
5028 thread->scissor[2] = dpsoftrast.fb_width;
5029 thread->scissor[3] = dpsoftrast.fb_height;
5030 thread->depthrange[0] = 0;
5031 thread->depthrange[1] = 1;
5032 thread->polygonoffset[0] = 0;
5033 thread->polygonoffset[1] = 0;
// Interlaced: the framebuffer is cut into 2*numthreads horizontal slices and
// thread i owns slice i and slice numthreads+i.
5035 if (dpsoftrast.interlace)
5037 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5038 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5039 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5040 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// Presumably the non-interlaced branch: one contiguous band per thread, stored
// in both band slots.
5044 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5045 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5048 thread->numspans = 0;
5049 thread->numtriangles = 0;
5050 thread->commandoffset = 0;
5051 thread->waiting = false;
5052 thread->starving = false;
// Mark all derived state as dirty and validate it once up front.
5054 thread->validate = -1;
5055 DPSOFTRAST_Validate(thread, -1);
// Threaded mode: create the synchronization objects first, then start the
// worker on this state.
5057 if (dpsoftrast.usethreads)
5059 thread->waitcond = Thread_CreateCond();
5060 thread->drawcond = Thread_CreateCond();
5061 thread->drawmutex = Thread_CreateMutex();
5062 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5068 void DPSOFTRAST_Shutdown(void)
5071 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5073 DPSOFTRAST_State_Thread *thread;
5074 for (i = 0; i < dpsoftrast.numthreads; i++)
5076 thread = &dpsoftrast.threads[i];
5077 Thread_LockMutex(thread->drawmutex);
5079 Thread_CondSignal(thread->drawcond);
5080 Thread_UnlockMutex(thread->drawmutex);
5081 Thread_WaitThread(thread->thread, 0);
5082 Thread_DestroyCond(thread->waitcond);
5083 Thread_DestroyCond(thread->drawcond);
5084 Thread_DestroyMutex(thread->drawmutex);
5087 for (i = 0;i < dpsoftrast.texture_end;i++)
5088 if (dpsoftrast.texture[i].bytes)
5089 MM_FREE(dpsoftrast.texture[i].bytes);
5090 if (dpsoftrast.texture)
5091 free(dpsoftrast.texture);
5092 if (dpsoftrast.threads)
5093 MM_FREE(dpsoftrast.threads);
5094 memset(&dpsoftrast, 0, sizeof(dpsoftrast));