3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// Byte alignment passed to _mm_malloc by MM_MALLOC and MM_CALLOC.
14 #define ATOMIC_SIZE 32
// Apple toolchains: 16-byte ALIGN, 32-byte ATOMIC, counters via the OSAtomic*32Barrier functions.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
// ATOMIC_ADD discards the result of the add (cast to void) on every platform.
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: same alignment attributes, counters via the __sync builtins.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align(...)) and the Interlocked* functions on a volatile LONG.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: ALIGN/ATOMIC add no alignment, MEMORY_BARRIER is a no-op, and the counter
// operations are plain (non-atomic) ++, -- and += on an int.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
// Aligned allocation: memory comes from _mm_malloc with ATOMIC_SIZE-byte alignment.
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: allocates nmemb*size aligned bytes and zero-fills them when the
// allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Memory obtained from _mm_malloc is released with _mm_free.
81 #define MM_FREE _mm_free
// Alternate definitions that map directly onto the C library allocator.
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the vertex attribute arrays: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and sizes the per-attribute tables
// (e.g. DPSOFTRAST_State_Triangle.attribs and DPSOFTRAST_State.post_array4f).
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture array.
104 typedef struct DPSOFTRAST_Texture_s
// filter mode, set by DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds of this texture -- confirm at its users
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks an unused slot (see DPSOFTRAST_Texture_GetByIndex)
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are aligned with ALIGN() and their sizes are rounded up to a multiple of
// COMMAND_SIZE by DPSOFTRAST_ALIGNCOMMAND.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header: the opcode and the size recorded by DPSOFTRAST_AllocateCommand.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// Opcode 0 is the Reset marker that DPSOFTRAST_AllocateCommand writes when a command does
// not fit in the remaining tail of the pool.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant DPSOFTRAST_OPCODE_<name>
// and the struct type DPSOFTRAST_Command_<name>, which begins with the same opcode and
// commandsize members as the generic header.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound on the size of a single command -- confirm at its users.
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command pool: commands[] holds the encoded command stream that DPSOFTRAST_AllocateCommand
// carves allocations from.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data used when interpolating attributes across spans.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] is the per-x slope, [1] the per-y slope and [2] the base value, each a
// 4-float vector (see DPSOFTRAST_CALCATTRIB)
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar) evaluate attribute
// arrayindex of a triangle at a span's position:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// The SSE form reads the attribute vectors with _mm_load_ps, which needs 16-byte aligned data.
// NOTE(review): the 4F form expands span as (span->x) rather than ((span)->x), so only a
// plain identifier is safe as the span argument.
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
166 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
167 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
168 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
169 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
170 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
171 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
172 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
173 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
174 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// A horizontal run of framebuffer pixels generated for one triangle; x and y are the
// coordinates fed into DPSOFTRAST_CALCATTRIB.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate. DPSOFTRAST_Validate clears a bit and recomputes the
// matching derived state (framebuffer/scissor, depth function, blend mode).
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All derived state a draw depends on.
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the source/destination blend
// factors and the blendsubtract flag. DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// Per-thread render state. The DPSOFTRAST_Interpret_* functions write the raw state members
// from queued commands; the fb_* members are derived from them by DPSOFTRAST_Validate.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
233 float polygonoffset[2];
236 int shader_permutation;
// textures bound per unit; rebased by DPSOFTRAST_Texture_Grow when the texture array moves
238 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
241 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243 // DPSOFTRAST_VALIDATE_ flags
246 // derived values (DPSOFTRAST_VALIDATE_FB)
249 ALIGN(float fb_viewportcenter[4]);
250 ALIGN(float fb_viewportscale[4]);
252 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
255 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's offset in the command pool, read by DPSOFTRAST_Draw_FreeCommandPool
264 ATOMIC(volatile int commandoffset);
// waiting: set while DPSOFTRAST_Draw_FreeCommandPool blocks on this thread's waitcond
266 volatile bool waiting;
// starving: when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before it waits
267 volatile bool starving;
// span and triangle storage, sized by DPSOFTRAST_DRAW_MAXSPANS / DPSOFTRAST_DRAW_MAXTRIANGLES
274 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
275 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is the global `dpsoftrast` defined below.
279 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets, stored by DPSOFTRAST_SetRenderTargets
283 unsigned int *fb_depthpixels;
284 unsigned int *fb_colorpixels[4];
// viewport transform recomputed by DPSOFTRAST_Viewport via DPSOFTRAST_RecalcViewport
287 ALIGN(float fb_viewportcenter[4]);
288 ALIGN(float fb_viewportscale[4]);
291 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
292 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-supplied vertex array pointers
294 const float *pointer_vertex3f;
295 const float *pointer_color4f;
296 const unsigned char *pointer_color4ub;
297 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// textures bound per unit; rebased by DPSOFTRAST_Texture_Grow when the texture array moves
302 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
306 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
307 float *screencoord4f;
313 int shader_permutation;
// index at which DPSOFTRAST_Texture_New starts searching for a free slot
317 int texture_firstfree;
// texture array; reallocated by DPSOFTRAST_Texture_Grow
318 DPSOFTRAST_Texture *texture;
// set to a static message by API functions that reject their arguments
323 const char *errorstring;
328 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand taken by DPSOFTRAST_Draw_SyncCommands
330 ATOMIC(volatile int drawcommand);
332 DPSOFTRAST_State_Command_Pool commandpool;
336 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting float depth to integer depth (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit BGRA value (a<<24 | r<<16 | g<<8 | b); each
// channel is scaled by 255 and rounded to nearest. Every argument is parenthesized so that
// expression arguments such as `x + y` expand with the intended precedence.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth DPSOFTRAST_DEPTHSCALE * (1 - d).
// The argument is parenthesized so that `1 - (x + y)` is computed rather than `1 - x + y`.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// NOTE(review): presumably the maximum number of pixels in one span -- confirm at its users.
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Computes the viewport transform from viewport = {x, y, width, height}.
// Element [1] holds the x term and element [2] the y term, which is flipped against
// dpsoftrast.fb_height; both centers are shifted by -0.5. Element [3] is center 0.5 /
// scale 0.5 and element [0] is center 0 / scale 1.
// NOTE(review): [3] and [0] are presumably the depth and w terms -- confirm at the users.
344 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
346 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
347 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
348 fb_viewportcenter[3] = 0.5f;
349 fb_viewportcenter[0] = 0.0f;
350 fb_viewportscale[1] = 0.5f * viewport[2];
351 fb_viewportscale[2] = -0.5f * viewport[3];
352 fb_viewportscale[3] = 0.5f;
353 fb_viewportscale[0] = 1.0f;
// Recomputes thread->fb_scissor as {x, y, width, height} in framebuffer rows (y flipped
// against dpsoftrast.fb_height) and refreshes the thread's viewport center/scale.
356 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
358 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
359 // and viewport projection values
362 x1 = thread->scissor[0];
363 x2 = thread->scissor[0] + thread->scissor[2];
364 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
365 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
366 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
368 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
370 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
371 thread->fb_scissor[0] = x1;
372 thread->fb_scissor[1] = y1;
373 thread->fb_scissor[2] = x2 - x1;
374 thread->fb_scissor[3] = y2 - y1;
376 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Effective depth comparison: the configured depthfunc while depth testing is enabled,
// GL_ALWAYS otherwise.
379 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
381 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the blend factor pair, packed as (sfactor<<16)|dfactor, to a DPSOFTRAST_BLENDMODE
// stored in thread->fb_blendmode. Any pair without a case falls back to OPAQUE.
384 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA / GL_ONE has a dedicated mode
386 if (thread->blendsubtract)
388 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the blend mode and breaks
390 #define BLENDFUNC(sfactor, dfactor, blendmode) \
391 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
393 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular (additive equation) blending
398 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
400 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
401 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
402 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
403 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
404 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both GL_ZERO/GL_SRC_COLOR and GL_DST_COLOR/GL_ZERO select MUL
405 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
406 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
407 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
408 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
409 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
410 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Fast path for DPSOFTRAST_Validate: calls it only when at least one of the flags in f is
// still set in thread->validate. The expression always evaluates to 0.
// The thread argument is parenthesized so pointer expressions such as &threads[i] expand
// correctly; note that it is evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state for every flag in mask that is currently dirty in
// thread->validate. Each flag is cleared before its Recalc function runs.
417 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only flags that are both requested and dirty need work
419 mask &= thread->validate;
422 if (mask & DPSOFTRAST_VALIDATE_FB)
424 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
425 DPSOFTRAST_RecalcFB(thread);
427 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
429 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
430 DPSOFTRAST_RecalcDepthFunc(thread);
432 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
434 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
435 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. A pointer into dpsoftrast.texture is returned when index is
// in [1, texture_end) and the slot has pixel storage allocated (bytes != NULL).
439 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
441 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
442 return &dpsoftrast.texture[index];
// Enlarges the texture array and rebases every bound-texture pointer (global and per
// thread) from the old array onto the reallocated one.
446 static void DPSOFTRAST_Texture_Grow(void)
// remembered so existing pointers can be converted to indices after the array moves
448 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
449 DPSOFTRAST_State_Thread *thread;
453 // expand texture array as needed
454 if (dpsoftrast.texture_max < 1024)
455 dpsoftrast.texture_max = 1024;
457 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL check; on
// failure the old array is lost and the rebasing below would start from a null base.
458 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures
459 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
460 if (dpsoftrast.texbound[i])
461 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bound textures
462 for (j = 0; j < dpsoftrast.numthreads; j++)
464 thread = &dpsoftrast.threads[j];
465 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
466 if (thread->texbound[i])
467 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates flags and dimensions (recording a message in
// dpsoftrast.errorstring for each rejected combination), claims the first free slot of the
// texture array, lays out the mip chain and allocates zero-filled pixel storage.
// NOTE(review): presumably returns the new texture index, or 0 on error -- confirm.
471 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides, everything else one
480 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
481 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
482 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the per-dimension size limit is checked, so
// it can overflow int for very large arguments.
483 if (width*height*depth < 1)
485 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
490 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
495 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
497 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
500 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
502 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
512 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
517 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
519 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
524 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the previous check verbatim.
527 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
529 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
534 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
537 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
539 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542 // find first empty slot in texture array
543 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
544 if (!dpsoftrast.texture[texnum].bytes)
546 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array and the used range when the chosen slot lies beyond them
547 if (dpsoftrast.texture_max <= texnum)
548 DPSOFTRAST_Texture_Grow();
549 if (dpsoftrast.texture_end <= texnum)
550 dpsoftrast.texture_end = texnum + 1;
551 texture = &dpsoftrast.texture[texnum];
552 memset(texture, 0, sizeof(*texture));
553 texture->flags = flags;
554 texture->width = width;
555 texture->height = height;
556 texture->depth = depth;
557 texture->sides = sides;
// bytes in this mip level: 4 bytes per texel, all sides
569 s = w * h * d * sides * 4;
// record the level's byte offset (running total), byte size and dimensions
570 texture->mipmap[mipmaps][0] = size;
571 texture->mipmap[mipmaps][1] = s;
572 texture->mipmap[mipmaps][2] = w;
573 texture->mipmap[mipmaps][3] = h;
574 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after the base level for non-mipmapped textures
577 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
583 texture->mipmaps = mipmaps;
584 texture->size = size;
586 // allocate the pixels now
587 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Frees the pixel storage of texture `index`, clears its slot and updates the free/used
// bookkeeping. Does nothing when the index is not a valid texture.
591 void DPSOFTRAST_Texture_Free(int index)
593 DPSOFTRAST_Texture *texture;
594 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
598 MM_FREE(texture->bytes);
599 texture->bytes = NULL;
// zeroing the slot also leaves bytes == NULL, which marks it unused
600 memset(texture, 0, sizeof(*texture));
601 // adjust the free range and used range
602 if (dpsoftrast.texture_firstfree > index)
603 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing unused slots
604 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
605 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index`, each from the next larger level,
// by averaging neighbouring 4-byte texels with rounding. Does nothing when the index is
// not a valid texture.
607 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
609 int i, x, y, z, w, layer0, layer1, row0, row1;
610 unsigned char *o, *i0, *i1, *i2, *i3;
611 DPSOFTRAST_Texture *texture;
612 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
613 if (texture->mipmaps <= 1)
615 for (i = 1;i < texture->mipmaps;i++)
617 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
621 if (layer1 >= texture->mipmap[i-1][4])
622 layer1 = texture->mipmap[i-1][4]-1;
623 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
627 if (row1 >= texture->mipmap[i-1][3])
628 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the (layer, row) combinations of the parent level
629 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
630 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
631 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
632 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
633 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
634 w = texture->mipmap[i][2];
637 if (texture->mipmap[i-1][2] > 1)
639 // average 3D texture
// eight source texels per output texel
640 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
642 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
643 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
644 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
645 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
650 // average 3D mipmap with parent width == 1
651 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
653 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
654 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
655 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
656 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
662 if (texture->mipmap[i-1][2] > 1)
664 // average 2D texture (common case)
// four source texels per output texel
665 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
667 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
668 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
669 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
670 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
675 // 2D texture with parent width == 1
676 o[0] = (i0[0] + i1[0] + 1) >> 1;
677 o[1] = (i0[1] + i1[1] + 1) >> 1;
678 o[2] = (i0[2] + i1[2] + 1) >> 1;
679 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels` (tightly
// packed rows) into the texture at (blockx, blocky), then rebuilds the mipmaps.
// NOTE(review): the destination address is computed from mipmap[0] only; check how the
// `mip` argument is meant to be honored. The block is not bounds-checked here.
686 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
688 DPSOFTRAST_Texture *texture;
690 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
693 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
694 while (blockheight > 0)
696 memcpy(dst, pixels, blockwidth * 4);
// source rows are contiguous; destination rows are one full base-level row apart
697 pixels += blockwidth * 4;
698 dst += texture->mipmap[0][2] * 4;
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of texture `index` with mipmap[0][1] bytes read from
// `pixels`, then rebuilds the mipmaps. Does nothing when the index is not a valid texture.
703 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
709 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
710 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip` of texture `index`, or 0 when the index is not a
// valid texture. NOTE(review): `mip` is not range-checked.
712 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
714 DPSOFTRAST_Texture *texture;
715 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
716 return texture->mipmap[mip][2];
// Returns the height of mip level `mip` of texture `index`, or 0 when the index is not a
// valid texture. NOTE(review): `mip` is not range-checked.
718 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
720 DPSOFTRAST_Texture *texture;
721 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip` of texture `index`, or 0 when the index is not a
// valid texture. NOTE(review): `mip` is not range-checked.
724 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's pixel
// storage, or a null pointer when the index is not a valid texture.
// NOTE(review): `mip` is not range-checked.
730 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
736 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Requesting a filter above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without the MIPMAP flag records an
// error message in dpsoftrast.errorstring.
738 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
740 DPSOFTRAST_Texture *texture;
741 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
742 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
744 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
749 texture->filter = filter;
// Stores the framebuffer size and the depth / color target pointers in the global state
// when any of them differs from the values currently set.
752 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
754 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
755 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
756 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
758 dpsoftrast.fb_width = width;
759 dpsoftrast.fb_height = height;
760 dpsoftrast.fb_depthpixels = depthpixels;
761 dpsoftrast.fb_colorpixels[0] = colorpixels0;
762 dpsoftrast.fb_colorpixels[1] = colorpixels1;
763 dpsoftrast.fb_colorpixels[2] = colorpixels2;
764 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// Forward declaration; used by DPSOFTRAST_AllocateCommand below.
767 static void DPSOFTRAST_Draw_FlushThreads(void);
// Copies the pool's current write position (commandpool.freecommand) into
// dpsoftrast.drawcommand. When threads are in use a MEMORY_BARRIER is issued first.
769 static void DPSOFTRAST_Draw_SyncCommands(void)
771 if(dpsoftrast.usethreads) MEMORY_BARRIER;
772 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool by recomputing how much of
// the pool the threads still hold and, if needed, waiting on a thread.
775 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
777 DPSOFTRAST_State_Thread *thread;
779 int freecommand = dpsoftrast.commandpool.freecommand;
780 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
781 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
783 DPSOFTRAST_Draw_SyncCommands();
// usedcommands becomes the largest distance (modulo the pool size) between the write
// position and any thread's commandoffset
789 for (i = 0; i < dpsoftrast.numthreads; i++)
791 thread = &dpsoftrast.threads[i];
792 commandoffset = freecommand - thread->commandoffset;
793 if (commandoffset < 0)
794 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
795 if (commandoffset > usedcommands)
798 usedcommands = commandoffset;
801 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// block on the selected thread until it signals waitcond; a starving thread is woken
// through drawcond first
803 thread = &dpsoftrast.threads[waitindex];
804 Thread_LockMutex(thread->drawmutex);
805 if (thread->commandoffset != dpsoftrast.drawcommand)
807 thread->waiting = true;
808 if (thread->starving) Thread_CondSignal(thread->drawcond);
809 Thread_CondWait(thread->waitcond, thread->drawmutex);
810 thread->waiting = false;
812 Thread_UnlockMutex(thread->drawmutex);
814 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (the bit masking requires
// COMMAND_SIZE to be a power of two).
817 #define DPSOFTRAST_ALIGNCOMMAND(size) \
818 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and yields it as a typed pointer.
819 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
820 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and writes the command header (opcode and
// commandsize) at the reserved position. The caller fills in the remaining fields through
// the pointer produced by DPSOFTRAST_ALLOCATECOMMAND.
822 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 DPSOFTRAST_Command *command;
825 int freecommand = dpsoftrast.commandpool.freecommand;
826 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra space required besides the command itself: one header, plus the whole tail of
// the buffer when the command does not fit before the end
827 int extra = sizeof(DPSOFTRAST_Command);
828 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
829 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough room: make space via DPSOFTRAST_Draw_FreeCommandPool /
// DPSOFTRAST_Draw_FlushThreads, then reload the pool counters
830 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832 if (dpsoftrast.usethreads)
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
835 DPSOFTRAST_Draw_FlushThreads();
836 freecommand = dpsoftrast.commandpool.freecommand;
837 usedcommands = dpsoftrast.commandpool.usedcommands;
// the command does not fit in the tail: mark the tail with a Reset command and count the
// tail as used
839 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
842 command->opcode = DPSOFTRAST_OPCODE_Reset;
843 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
846 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
847 command->opcode = opcode;
848 command->commandsize = size;
850 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// publish the updated write position and usage
852 dpsoftrast.commandpool.freecommand = freecommand;
853 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives `size` bytes back to the command pool: freecommand is adjusted (wrapping by the
// pool size) and usedcommands is reduced by size.
857 static void DPSOFTRAST_UndoCommand(int size)
859 int freecommand = dpsoftrast.commandpool.freecommand;
860 int usedcommands = dpsoftrast.commandpool.usedcommands;
863 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
864 usedcommands -= size;
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): rectangle as x, y, width, height.
869 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Applies a Viewport command to a thread's state and marks its framebuffer-derived state dirty.
870 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
872 thread->viewport[0] = command->x;
873 thread->viewport[1] = command->y;
874 thread->viewport[2] = command->width;
875 thread->viewport[3] = command->height;
876 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and also updates the global viewport and its derived
// center/scale immediately.
878 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
880 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
883 command->width = width;
884 command->height = height;
886 dpsoftrast.viewport[0] = x;
887 dpsoftrast.viewport[1] = y;
888 dpsoftrast.viewport[2] = width;
889 dpsoftrast.viewport[3] = height;
890 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): clear color as float r, g, b, a.
893 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Clears the scissor rectangle of each color target that is set, using the packed clear
// color, limited to this thread's row ranges (miny1..maxy1 and miny2..maxy2).
894 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
896 int i, x1, y1, x2, y2, w, h, x, y;
897 int miny1 = thread->miny1;
898 int maxy1 = thread->maxy1;
899 int miny2 = thread->miny2;
900 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
904 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
905 x1 = thread->fb_scissor[0];
906 y1 = thread->fb_scissor[1];
907 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
908 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to the rows between miny1 and maxy2
909 if (y1 < miny1) y1 = miny1;
910 if (y2 > maxy2) y2 = maxy2;
915 // FIXME: honor fb_colormask?
916 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
917 for (i = 0;i < 4;i++)
919 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, the next starts at miny2 and runs to maxy2
921 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
924 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
925 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
930 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
932 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): float depth value.
939 DEFCOMMAND(3, ClearDepth, float depth;)
// Clears the scissor rectangle of the depth buffer with the converted depth value, limited
// to this thread's row ranges (miny1..maxy1 and miny2..maxy2).
940 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
942 int x1, y1, x2, y2, w, h, x, y;
943 int miny1 = thread->miny1;
944 int maxy1 = thread->maxy1;
945 int miny2 = thread->miny2;
946 int maxy2 = thread->maxy2;
// make sure fb_scissor is current before reading it
950 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
951 x1 = thread->fb_scissor[0];
952 y1 = thread->fb_scissor[1];
953 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to the rows between miny1 and maxy2
955 if (y1 < miny1) y1 = miny1;
956 if (y2 > maxy2) y2 = maxy2;
961 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, the next starts at miny2 and runs to maxy2
962 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
965 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
966 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
970 void DPSOFTRAST_ClearDepth(float d)
972 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): per-channel write enables.
976 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores each enable as 0/1 and builds fb_colormask, a bit mask in the BGRA8 layout
// (0x00FF0000 red, 0x0000FF00 green, 0x000000FF blue, 0xFF000000 alpha).
977 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
979 thread->colormask[0] = command->r != 0;
980 thread->colormask[1] = command->g != 0;
981 thread->colormask[2] = command->b != 0;
982 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag yields all-zero or all-one bits, which selects the channel's byte
983 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
985 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
987 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag.
994 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the depth test enable and marks the derived depth function dirty.
995 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
997 thread->depthtest = command->enable;
998 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command.
1000 void DPSOFTRAST_DepthTest(int enable)
1002 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1003 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
1006 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the scissor test enable and marks the framebuffer-derived state dirty.
1007 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1009 thread->scissortest = command->enable;
1010 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command.
1012 void DPSOFTRAST_ScissorTest(int enable)
1014 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1015 command->enable = enable;
// Scissor command (opcode 7): rectangle as float x, y, width, height.
1018 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Stores the scissor rectangle and marks the framebuffer-derived state dirty.
1019 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1021 thread->scissor[0] = command->x;
1022 thread->scissor[1] = command->y;
1023 thread->scissor[2] = command->width;
1024 thread->scissor[3] = command->height;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
1027 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1029 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1032 command->width = width;
1033 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors.
1036 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the blend factors and marks the derived blend mode dirty.
1037 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1039 thread->blendfunc[0] = command->sfactor;
1040 thread->blendfunc[1] = command->dfactor;
1041 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command.
1043 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1045 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1046 command->sfactor = sfactor;
1047 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag.
1050 DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the subtractive-blend enable and marks the derived blend mode dirty.
1051 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1053 thread->blendsubtract = command->enable;
1054 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command.
1056 void DPSOFTRAST_BlendSubtract(int enable)
1058 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1059 command->enable = enable;
// DepthMask command (opcode 10): enable flag.
1062 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the depth mask flag in the thread's state.
1063 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1065 thread->depthmask = command->enable;
// Queues a DepthMask command.
1067 void DPSOFTRAST_DepthMask(int enable)
1069 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1070 command->enable = enable;
// DepthFunc command (opcode 11): depth comparison function.
1073 DEFCOMMAND(11, DepthFunc, int func;)
// Stores the depth comparison function in the thread's state.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest this does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC, although DPSOFTRAST_RecalcDepthFunc derives
// thread->fb_depthfunc from depthfunc; the derived value may stay stale until the depth
// test is toggled -- confirm whether that is intended.
1074 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1076 thread->depthfunc = command->func;
// Queues a DepthFunc command.
1078 void DPSOFTRAST_DepthFunc(int func)
1080 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1081 command->func = func;
// DepthRange command record and its per-thread handler.
// depthrange[0] receives the near value and depthrange[1] the far value.
1084 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1085 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1087 thread->depthrange[0] = command->nearval;
1088 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and stores the near/far values.
1090 void DPSOFTRAST_DepthRange(float nearval, float farval)
1092 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1093 command->nearval = nearval;
1094 command->farval = farval;
// PolygonOffset command record and its per-thread handler.
// polygonoffset[0] receives alongnormal and polygonoffset[1] receives intoview.
1097 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1098 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1100 thread->polygonoffset[0] = command->alongnormal;
1101 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and stores both offset terms.
1103 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1105 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1106 command->alongnormal = alongnormal;
1107 command->intoview = intoview;
// CullFace command record and its per-thread handler, which copies the cull mode
// into the thread state.
1110 DEFCOMMAND(14, CullFace, int mode;)
1111 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1113 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command carrying the cull mode.
1115 void DPSOFTRAST_CullFace(int mode)
1117 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1118 command->mode = mode;
// AlphaTest command record and its per-thread handler, which copies the enable flag
// into thread->alphatest.
1121 DEFCOMMAND(15, AlphaTest, int enable;)
1122 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1124 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command carrying the enable flag.
1126 void DPSOFTRAST_AlphaTest(int enable)
1128 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1129 command->enable = enable;
// AlphaFunc command record and its per-thread handler.
// The comparison selector goes to thread->alphafunc and the reference value to
// thread->alphavalue.
1132 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1133 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1135 thread->alphafunc = command->func;
1136 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and stores the function selector.
// Listing line 1142 is elided (presumably the store of `ref` into the command - confirm).
1138 void DPSOFTRAST_AlphaFunc(int func, float ref)
1140 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1141 command->func = func;
// Sets the current constant color in the global state, stored in r, g, b, a order.
// Unlike the setters above, no command is allocated in the visible lines: the value is
// written straight into dpsoftrast.color.
1145 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1147 dpsoftrast.color[0] = r;
1148 dpsoftrast.color[1] = g;
1149 dpsoftrast.color[2] = b;
1150 dpsoftrast.color[3] = a;
// Copies a rectangular block of the color buffer (fb_colorpixels[0]) into outpixels.
// Rows of outpixels are blockwidth*4 bytes apart; rows of the framebuffer are fb_width*4 bytes apart.
// Several declarations and the per-pixel copy bodies are elided from this listing.
1153 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1155 int outstride = blockwidth * 4;
1156 int instride = dpsoftrast.fb_width * 4;
1159 int bx2 = blockx + blockwidth;
1160 int by2 = blocky + blockheight;
1164 unsigned char *inpixels;
// clamp the requested block to the framebuffer bounds
1168 if (bx1 < 0) bx1 = 0;
1169 if (by1 < 0) by1 = 0;
1170 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1171 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// two copy loops follow: this one under the bigendian test, the second presumably in its else branch
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// source row is counted from the last framebuffer row (fb_height - 1 - y), i.e. read vertically flipped
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// destination row is relative to the clamped top edge by1
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color buffer at (sx, sy) into mip level `mip`
// of texture `index` at (tx, ty). Both rectangles are clamped to their surfaces and the
// copied size is limited to the smaller of the two.
// Returns without copying if the texture handle does not resolve or mip is out of range.
// Several declarations and the tw/th/sw/sh computations are elided from this listing.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source is the color buffer, addressed as 32-bit pixels
1227 spixels = dpsoftrast.fb_colorpixels[0];
1228 swidth = dpsoftrast.fb_width;
1229 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into the texture data, [2] and [3] as width and height
1230 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1231 twidth = texture->mipmap[mip][2];
1232 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1233 if (tx1 < 0) tx1 = 0;
1234 if (ty1 < 0) ty1 = 0;
1235 if (tx2 > twidth) tx2 = twidth;
1236 if (ty2 > theight) ty2 = theight;
1237 if (sx1 < 0) sx1 = 0;
1238 if (sy1 < 0) sy1 = 0;
1239 if (sx2 > swidth) sx2 = swidth;
1240 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1245 if (tw > sw) tw = sw;
1246 if (th > sh) th = sh;
// empty rectangle check; the statement it guards is elided (presumably an early return)
1247 if (tw < 1 || th < 1)
// one memcpy of tw 32-bit pixels per row
1249 for (y = 0;y < th;y++)
1250 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// the lower mip levels are regenerated after the copy when the texture has any
1251 if (texture->mipmaps > 1)
1252 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command record and its per-thread handler.
// If a texture is currently bound on the unit, its bind counter is atomically decremented
// before the unit is pointed at the command's texture (which may be NULL, given the check).
1255 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1256 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 if (thread->texbound[command->unitnum])
1259 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1260 thread->texbound[command->unitnum] = command->texture;
1262 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 DPSOFTRAST_Command_SetTexture *command;
1265 DPSOFTRAST_Texture *texture;
1266 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1271 texture = DPSOFTRAST_Texture_GetByIndex(index);
1272 if (index && !texture)
1274 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1278 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1279 command->unitnum = unitnum;
1280 command->texture = texture;
1282 dpsoftrast.texbound[unitnum] = texture;
1283 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array and its stride in the global state.
// The pointer is stored, not copied, so the array has to remain valid until it is read.
// stride is used as a byte stride when the array is loaded (see DPSOFTRAST_Array_Load).
1286 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 dpsoftrast.pointer_vertex3f = vertex3f;
1289 dpsoftrast.stride_vertex = stride;
// Selects a float color array as the color source and clears the byte color pointer,
// so only one of the two color sources is set at a time.
1291 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 dpsoftrast.pointer_color4f = color4f;
1294 dpsoftrast.pointer_color4ub = NULL;
1295 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array as the color source and clears the float color
// pointer. Both variants share the single stride_color field.
1297 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 dpsoftrast.pointer_color4f = NULL;
1300 dpsoftrast.pointer_color4ub = color4ub;
1301 dpsoftrast.stride_color = stride;
// Records the texcoord array, component count and stride for texcoord unit `unitnum`.
// NOTE(review): unitnum indexes three global arrays and is not range-checked in the
// visible lines - confirm callers only pass valid units.
1303 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1306 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1307 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command record and its per-thread handler, which copies the shader mode
// and permutation into the thread state.
1310 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1311 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 thread->shader_mode = command->mode;
1314 thread->shader_permutation = command->permutation;
// Public entry point: queues a SetShader command and also mirrors the mode and
// permutation into the global state, so both copies hold the same selection.
1316 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1319 command->mode = mode;
1320 command->permutation = permutation;
1322 dpsoftrast.shader_mode = mode;
1323 dpsoftrast.shader_permutation = permutation;
// Uniform4f command record and its per-thread handler.
// Each uniform occupies four consecutive floats, so uniform `index` starts at
// uniform4f[index*4]; the handler copies the command's four floats there.
1326 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1327 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command holding (v0, v1, v2, v3) and writes
// the same four values into the global uniform table at index*4.
1331 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1334 command->index = index;
1335 command->val[0] = v0;
1336 command->val[1] = v1;
1337 command->val[2] = v2;
1338 command->val[3] = v3;
1340 dpsoftrast.uniform4f[index*4+0] = v0;
1341 dpsoftrast.uniform4f[index*4+1] = v1;
1342 dpsoftrast.uniform4f[index*4+2] = v2;
1343 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point at four floats. It queues the same
// Uniform4f command type and mirrors the values into the global uniform table.
1345 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348 command->index = index;
1349 memcpy(command->val, v, sizeof(command->val));
1351 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command record (val is declared through ALIGN) and its per-thread handler.
// A matrix spans four consecutive uniform slots: all 16 floats are copied into
// uniform4f starting at index*4.
1354 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1355 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at uniform slot `uniform`.
// One UniformMatrix4f command is queued per matrix and the same data is mirrored into
// the global uniform table.
// The declarations of i/index and the conditions around the load and transpose groups
// are elided from this listing.
1359 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix takes 4 uniform slots and 16 input floats
1363 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365 __m128 m0, m1, m2, m3;
1366 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1367 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned loads when v is not a multiple of ALIGN_SIZE, aligned loads otherwise
1368 if (((size_t)v)&(ALIGN_SIZE-1))
1370 m0 = _mm_loadu_ps(v);
1371 m1 = _mm_loadu_ps(v+4);
1372 m2 = _mm_loadu_ps(v+8);
1373 m3 = _mm_loadu_ps(v+12);
1377 m0 = _mm_load_ps(v);
1378 m1 = _mm_load_ps(v+4);
1379 m2 = _mm_load_ps(v+8);
1380 m3 = _mm_load_ps(v+12);
// 4x4 transpose via unpack and move; the guarding condition is elided (presumably `transpose`)
1384 __m128 t0, t1, t2, t3;
1385 t0 = _mm_unpacklo_ps(m0, m1);
1386 t1 = _mm_unpacklo_ps(m2, m3);
1387 t2 = _mm_unpackhi_ps(m0, m1);
1388 t3 = _mm_unpackhi_ps(m2, m3);
1389 m0 = _mm_movelh_ps(t0, t1);
1390 m1 = _mm_movehl_ps(t1, t0);
1391 m2 = _mm_movelh_ps(t2, t3);
1392 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both command->val and the global uniform4f slot must be 16-byte aligned
1394 _mm_store_ps(command->val, m0);
1395 _mm_store_ps(command->val+4, m1);
1396 _mm_store_ps(command->val+8, m2);
1397 _mm_store_ps(command->val+12, m3);
1398 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command record and its per-thread handler, which stores the integer value
// in the thread's uniform1i table at the command's index.
1406 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1407 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command and mirrors i0 into the global
// uniform1i table. Listing line 1415 is elided (presumably the store of i0 into
// command->val - confirm).
1411 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1414 command->index = index;
1417 dpsoftrast.uniform1i[command->index] = i0;
// Copies 4-float elements from a strided source into dst, which is written with aligned
// stores and therefore must be 16-byte aligned. `end` marks size*4 floats past dst.
// Unaligned loads are used when either src or stride is not a multiple of ALIGN_SIZE.
// The loops and pointer advances are elided from this listing.
1421 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 float *end = dst + size*4;
1424 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1428 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1437 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float source elements into 4-float vectors (x, y, z, 1.0) in dst.
// dst is written with aligned stores. The loops and most pointer advances are elided
// from this listing.
// Tightly packed input (stride == 12 bytes) is converted four elements at a time from
// three 16-byte loads; end4 marks the end of the part that is a multiple of four
// elements. Everything else goes one element at a time.
// NOTE(review): the one-element path loads 16 bytes per 12-byte element, so it reads
// 4 bytes past the final element - confirm source buffers tolerate that.
1444 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1446 float *end = dst + size*4;
1447 if (stride == sizeof(float[3]))
1449 float *end4 = dst + (size&~3)*4;
1450 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1,v2,v3 hold 12 floats = four xyz triples; each output is built by rotating the triple
// up one lane, writing 1.0 into lane 0, then rotating back so the 1.0 lands in w
1454 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1455 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1456 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1457 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1459 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1460 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1461 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1462 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1463 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1464 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1465 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1466 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468 src += 4*sizeof(float[3]);
// same conversion with aligned loads
1475 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1476 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1480 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1481 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1483 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1484 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1485 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1487 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489 src += 4*sizeof(float[3]);
// one element at a time: unaligned loads unless both src and stride are multiples of ALIGN_SIZE
1493 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1497 __m128 v = _mm_loadu_ps((const float *)src);
1498 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1499 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1500 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1501 _mm_store_ps(dst, v);
1510 __m128 v = _mm_load_ps((const float *)src);
1511 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1512 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1513 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1514 _mm_store_ps(dst, v);
// Expands 2-float source elements into 4-float vectors (s, t, 0.0, 1.0) in dst.
// dst is written with aligned stores. The loops and most pointer advances are elided
// from this listing.
// Tightly packed input (stride == 8 bytes) is converted two elements per 16-byte load;
// end2 marks the end of the even part. Other input goes one element at a time.
1521 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1523 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output vector
1524 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1525 if (stride == sizeof(float[2]))
1527 float *end2 = dst + (size&~1)*4;
1528 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = (s0, t0, s1, t1): first output takes the low pair, second takes the high pair
1532 __m128 v = _mm_loadu_ps((const float *)src);
1533 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1534 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1536 src += 2*sizeof(float[2]);
1543 __m128 v = _mm_load_ps((const float *)src);
1544 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1545 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1547 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half, keep (0, 1) in the high half
1553 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte source elements into 4-float vectors, each byte scaled by 1/255 so
// that 0..255 maps to 0.0..1.0. Component order is kept as in the source.
// dst is written with aligned stores. The loops and most pointer advances are elided
// from this listing.
// Tightly packed input (stride == 4 bytes) is converted four elements per 16-byte load;
// end4 marks the end of the part that is a multiple of four elements.
1559 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 float *end = dst + size*4;
1562 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1563 if (stride == sizeof(unsigned char[4]))
1565 float *end4 = dst + (size&~3)*4;
1566 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16 bits, then to 32 bits, by interleaving with zero, then convert to float
1570 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1571 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1572 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576 src += 4*sizeof(unsigned char[4]);
1583 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1584 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1585 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589 src += 4*sizeof(unsigned char[4]);
// single element: the four bytes are read through an int pointer
1595 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1596 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single 4-float value at src into dst (the loop is elided from this listing;
// presumably it repeats until `end`, i.e. `size` times).
// src is read with an unaligned load; dst is written with an aligned store.
1602 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1604 float *end = dst + 4*size;
1605 __m128 v = _mm_loadu_ps(src);
1608 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors from in4f by the 4x4 matrix inmatrix16f into out4f.
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the four consecutive
// 4-float groups of the matrix.
// An exact identity matrix is detected with memcmp and handled as a plain copy, which is
// skipped entirely when out4f == in4f.
// The loops, the store calls and the `end` declaration are elided from this listing.
1614 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1617 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1618 __m128 m0, m1, m2, m3;
1620 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1622 // fast case for identity matrix
1623 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1626 end = out4f + numitems*4;
// the matrix is always read with unaligned loads; only the input vectors get an aligned path
1627 m0 = _mm_loadu_ps(inmatrix16f);
1628 m1 = _mm_loadu_ps(inmatrix16f + 4);
1629 m2 = _mm_loadu_ps(inmatrix16f + 8);
1630 m3 = _mm_loadu_ps(inmatrix16f + 12);
1631 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1635 __m128 v = _mm_loadu_ps(in4f);
1637 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1640 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1649 __m128 v = _mm_load_ps(in4f);
1651 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1654 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
1662 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1664 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a clip-space vertex `in` = (x, y, z, w) and applies the viewport.
// The vertex is rotated to (1, x, y, z), multiplied by viewportscale, divided by w,
// offset by viewportcenter and rotated back. Lanes 1..3 of the viewport vectors
// therefore apply to x, y, z, and lane 0 produces the last output component
// (center[0] + scale[0]/w).
// No guard against w == 0 is visible.
1668 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1670 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1671 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1672 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1673 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection helper with the same parameters as DPSOFTRAST_PROJECTVERTEX.
// NOTE(review): the visible body is identical to DPSOFTRAST_PROJECTVERTEX line for line,
// and no use of this macro appears in this part of the file - confirm whether it is
// still needed.
1676 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1678 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1679 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1680 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1681 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a 4-component vertex by a matrix given as four vectors m0..m3:
// out = x*m0 + y*m1 + z*m2 + w*m3.
// The line that introduces the local `p` is elided (presumably a copy of `in`).
1684 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1687 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1690 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space row range [*starty, *endy) covered by the axis-aligned box
// minpos..maxpos after transformation by the matrix m0..m3.
//
// The eight box corners are transformed (BBFRONT). A corner with z + w >= 0 clears its
// bit in clipmask and contributes its y/w to the running min/max. For each corner whose
// bit stays set, BBCLIP looks at the three neighbours that differ in one axis
// (k^1, k^2, k^4); if the neighbour's bit is clear, the edge is interpolated to the
// z + w == 0 plane and that point contributes instead.
// The result is clamped to [-2, 2], mapped through the viewport and truncated to ints.
// *starty is derived from maxproj and *endy from minproj, presumably because the
// viewport y scale is negative - confirm.
//
// clipmask starts at 0xFF, so any bit still set marks a corner that failed the test.
// The return statement, the BBFRONT(0)/BBFRONT(7) calls and the `#define BBCLIP` line
// are elided from this listing.
1693 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1695 int clipmask = 0xFF;
1696 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix vector so the transformed y ends up in lane 0,
// where the scalar (_ss) min/max/div operations below act
1697 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1698 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1699 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1700 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1701 #define BBFRONT(k, pos) \
1703 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1704 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1705 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1708 clipmask &= ~(1<<k); \
1709 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1710 minproj = _mm_min_ss(minproj, proj); \
1711 maxproj = _mm_max_ss(maxproj, proj); \
1715 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1716 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1717 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1724 if (clipmask&(1<<k)) \
1726 if (!(clipmask&(1<<(k^1)))) \
1728 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1729 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1730 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1734 if (!(clipmask&(1<<(k^2)))) \
1736 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1737 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1738 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1739 minproj = _mm_min_ss(minproj, proj); \
1740 maxproj = _mm_max_ss(maxproj, proj); \
1742 if (!(clipmask&(1<<(k^4)))) \
1744 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1745 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1746 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1747 minproj = _mm_min_ss(minproj, proj); \
1748 maxproj = _mm_max_ss(maxproj, proj); \
1752 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the lane PROJECTVERTEX applies to y) into lane 0
1753 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1754 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before scaling
1755 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1756 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1757 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1758 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// truncating conversion; +1 makes endy an exclusive bound
1759 *starty = _mm_cvttss_si32(maxproj);
1760 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems clip-space vertices from in4f without transforming them.
// Each vertex is copied unchanged to out4f and its projected form is written to screen4f.
// The component-wise min/max over all inputs is tracked and handed to
// DPSOFTRAST_Vertex_BoundY with an identity matrix to fill in *starty / *endy.
// Returns DPSOFTRAST_Vertex_BoundY's result.
// out4f, screen4f and the global viewport vectors are accessed with aligned operations.
// The loops and pointer advances are elided from this listing.
1764 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1766 float *end = out4f + numitems*4;
1767 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1768 __m128 minpos, maxpos;
1769 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex (read even when numitems is 0)
1771 minpos = maxpos = _mm_loadu_ps(in4f);
1774 __m128 v = _mm_loadu_ps(in4f);
1775 minpos = _mm_min_ps(minpos, v);
1776 maxpos = _mm_max_ps(maxpos, v);
1777 _mm_store_ps(out4f, v);
1778 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1779 _mm_store_ps(screen4f, v);
// same loop body with aligned loads
1787 minpos = maxpos = _mm_load_ps(in4f);
1790 __m128 v = _mm_load_ps(in4f);
1791 minpos = _mm_min_ps(minpos, v);
1792 maxpos = _mm_max_ps(maxpos, v);
1793 _mm_store_ps(out4f, v);
1794 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1795 _mm_store_ps(screen4f, v);
// the identity matrix is passed because the vertices are already in clip space
1802 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1803 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1804 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1805 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1806 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices from in4f by inmatrix16f and projects them.
// The transformed vertex goes to out4f and its projected form to screen4f.
// An exact identity matrix is forwarded to DPSOFTRAST_Vertex_Project instead.
// minpos/maxpos track the bounds of the untransformed inputs; DPSOFTRAST_Vertex_BoundY
// then applies the same matrix to that box to fill in *starty / *endy, and its result
// is returned.
// The loops, pointer advances and the `end` declaration are elided from this listing.
1810 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1812 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1813 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1815 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1816 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1817 end = out4f + numitems*4;
1818 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1819 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1820 m0 = _mm_loadu_ps(inmatrix16f);
1821 m1 = _mm_loadu_ps(inmatrix16f + 4);
1822 m2 = _mm_loadu_ps(inmatrix16f + 8);
1823 m3 = _mm_loadu_ps(inmatrix16f + 12);
1824 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex (read even when numitems is 0)
1826 minpos = maxpos = _mm_loadu_ps(in4f);
1829 __m128 v = _mm_loadu_ps(in4f);
1830 minpos = _mm_min_ps(minpos, v);
1831 maxpos = _mm_max_ps(maxpos, v);
1832 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1833 _mm_store_ps(out4f, v);
1834 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1835 _mm_store_ps(screen4f, v);
// same loop body with aligned loads
1843 minpos = maxpos = _mm_load_ps(in4f);
1846 __m128 v = _mm_load_ps(in4f);
1847 minpos = _mm_min_ps(minpos, v);
1848 maxpos = _mm_max_ps(maxpos, v);
1849 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1850 _mm_store_ps(out4f, v);
1851 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1852 _mm_store_ps(screen4f, v);
1859 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Expands one client vertex array into the 4-float working array post_array4f[outarray],
// covering vertices firstvertex .. firstvertex + numvertices - 1.
// Positions are expanded from 3 floats. Colors come from the float array if set, else
// the byte array if set, else the constant dpsoftrast.color. Texcoords are expanded
// according to the unit's component count.
// stride is applied as a byte offset per vertex.
// The switch headers, case labels, the `stride` declaration and the return statement are
// elided from this listing (the return value is presumably outf).
1864 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1867 float *outf = dpsoftrast.post_array4f[outarray];
1868 const unsigned char *inb;
1869 int firstvertex = dpsoftrast.firstvertex;
1870 int numvertices = dpsoftrast.numvertices;
1874 case DPSOFTRAST_ARRAY_POSITION:
1875 stride = dpsoftrast.stride_vertex;
1876 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1877 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1879 case DPSOFTRAST_ARRAY_COLOR:
1880 stride = dpsoftrast.stride_color;
1881 if (dpsoftrast.pointer_color4f)
1883 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1884 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1886 else if (dpsoftrast.pointer_color4ub)
1888 stride = dpsoftrast.stride_color;
1889 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1890 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: every vertex gets the constant color
1894 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// remaining array ids are treated as texcoord units, numbered from DPSOFTRAST_ARRAY_TEXCOORD0
1898 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1899 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1901 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1902 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1905 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1908 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1911 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms working array `outarray` in place by inmatrix16f.
// If inarray >= 0 the array is first loaded from that client array; otherwise the
// existing contents of post_array4f[outarray] are used.
// The return statement is elided from this listing (presumably returns data).
1923 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects working array `outarray` to screen space (loaded from inarray first when
// inarray >= 0). Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and DPSOFTRAST_Vertex_Project's result to drawclipped.
// The return statement is elided from this listing (presumably returns data).
1931 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1934 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Like DPSOFTRAST_Array_Project, but the vertices are transformed in place by
// inmatrix16f before projection. Updates screencoord4f, drawstarty/drawendy and
// drawclipped in the global state.
// The return statement is elided from this listing (presumably returns data).
1943 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1946 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1947 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes a per-pixel value z = 1/w across the span for pixels startx..endx-1.
// w varies linearly across the span (slope triangle->w[0] per pixel in x, triangle->w[1]
// per row in y, constant term triangle->w[2]). The exact reciprocal is evaluated only at
// sub-span boundaries DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart and interpolated linearly
// in between, which avoids a division per pixel.
// The declarations of x/z/dz, the line that carries endz over into z, and the inner loop
// body are elided from this listing (the body presumably stores z into zf).
1954 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1957 int startx = span->startx;
1958 int endx = span->endx;
1959 float wslope = triangle->w[0];
1960 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1961 float endz = 1.0f / (w + wslope * startx);
1962 for (x = startx;x < endx;)
1964 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final sub-span is shortened so it ends exactly on the last pixel
1966 if (nextsub >= endx) nextsub = endsub = endx-1;
1967 endz = 1.0f / (w + wslope * nextsub);
// a one-pixel sub-span gets dz = 0, which also avoids a division by zero
1968 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1969 for (; x <= endsub; x++, z += dz)
// Writes a span of float colors (in4f, four floats per pixel, indexed by x) into the
// color buffer fb_colorpixels[0], combining with the existing pixel according to
// thread->fb_blendmode.
// In every mode except INVADD, in4f component 2 pairs with framebuffer byte 0 and in4f
// component 0 with byte 2, so the framebuffer holds the first and third components
// swapped relative to in4f.
// Results are clamped to 255 from above. Only SUBALPHA also clamps to 0 from below.
// The per-pixel pixelmask tests, several declarations, and the braces and breaks are
// elided from this listing.
1974 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1977 int startx = span->startx;
1978 int endx = span->endx;
1981 unsigned char * RESTRICT pixelmask = span->pixelmask;
1982 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// move to the first pixel of this span's row (4 bytes per pixel)
1985 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
// NOTE(review): the threshold below is the constant 0.5; thread->alphafunc and
// thread->alphavalue are not consulted in the visible lines - confirm this is intended
1986 // handle alphatest now (this affects depth writes too)
1987 if (thread->alphatest)
1988 for (x = startx;x < endx;x++)
1989 if (in4f[x*4+3] < 0.5f)
1990 pixelmask[x] = false;
1991 // FIXME: this does not handle bigendian
1992 switch(thread->fb_blendmode)
// OPAQUE: dst = src
1994 case DPSOFTRAST_BLENDMODE_OPAQUE:
1995 for (x = startx;x < endx;x++)
1999 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2000 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2001 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2002 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2003 pixel[x*4+0] = d[0];
2004 pixel[x*4+1] = d[1];
2005 pixel[x*4+2] = d[2];
2006 pixel[x*4+3] = d[3];
// ALPHA: dst = src*alpha + dst*(1 - alpha)
2009 case DPSOFTRAST_BLENDMODE_ALPHA:
2010 for (x = startx;x < endx;x++)
2014 a = in4f[x*4+3] * 255.0f;
2015 b = 1.0f - in4f[x*4+3];
2016 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2017 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2018 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2019 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2020 pixel[x*4+0] = d[0];
2021 pixel[x*4+1] = d[1];
2022 pixel[x*4+2] = d[2];
2023 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src*alpha + dst
2026 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2027 for (x = startx;x < endx;x++)
2031 a = in4f[x*4+3] * 255.0f;
2032 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2033 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2034 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2035 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2036 pixel[x*4+0] = d[0];
2037 pixel[x*4+1] = d[1];
2038 pixel[x*4+2] = d[2];
2039 pixel[x*4+3] = d[3];
// ADD: dst = src + dst
2042 case DPSOFTRAST_BLENDMODE_ADD:
2043 for (x = startx;x < endx;x++)
2047 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2048 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2049 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2050 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2051 pixel[x*4+0] = d[0];
2052 pixel[x*4+1] = d[1];
2053 pixel[x*4+2] = d[2];
2054 pixel[x*4+3] = d[3];
// INVMOD: dst = dst*(1 - src)
2057 case DPSOFTRAST_BLENDMODE_INVMOD:
2058 for (x = startx;x < endx;x++)
2062 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066 pixel[x*4+0] = d[0];
2067 pixel[x*4+1] = d[1];
2068 pixel[x*4+2] = d[2];
2069 pixel[x*4+3] = d[3];
// MUL: dst = src*dst
2072 case DPSOFTRAST_BLENDMODE_MUL:
2073 for (x = startx;x < endx;x++)
2077 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081 pixel[x*4+0] = d[0];
2082 pixel[x*4+1] = d[1];
2083 pixel[x*4+2] = d[2];
2084 pixel[x*4+3] = d[3];
// MUL2: dst = 2*src*dst
2087 case DPSOFTRAST_BLENDMODE_MUL2:
2088 for (x = startx;x < endx;x++)
2092 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2093 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2094 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2095 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2096 pixel[x*4+0] = d[0];
2097 pixel[x*4+1] = d[1];
2098 pixel[x*4+2] = d[2];
2099 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src*alpha; the only mode that also clamps at 0
2102 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2103 for (x = startx;x < endx;x++)
2107 a = in4f[x*4+3] * -255.0f;
2108 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2109 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2110 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2111 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2112 pixel[x*4+0] = d[0];
2113 pixel[x*4+1] = d[1];
2114 pixel[x*4+2] = d[2];
2115 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src*a + dst*(1 - alpha); the assignment of `a` is elided from this listing
2118 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2119 for (x = startx;x < endx;x++)
2124 b = 1.0f - in4f[x*4+3];
2125 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2126 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2127 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2128 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2129 pixel[x*4+0] = d[0];
2130 pixel[x*4+1] = d[1];
2131 pixel[x*4+2] = d[2];
2132 pixel[x*4+3] = d[3];
// INVADD: dst = (255 - dst)*src + dst
// NOTE(review): d[0] is computed from pixel byte 2 / in4f component 0 but stored to pixel
// byte 0 (and d[2] the other way round), unlike every other mode above; this looks like
// the first and third channels get exchanged in the framebuffer on each blend - confirm
2135 case DPSOFTRAST_BLENDMODE_INVADD:
2136 for (x = startx;x < endx;x++)
2140 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2141 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2142 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2143 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2144 pixel[x*4+0] = d[0];
2145 pixel[x*4+1] = d[1];
2146 pixel[x*4+2] = d[2];
2147 pixel[x*4+3] = d[3];
// Writes one span of 8-bit colors (in4ub, four bytes per pixel) into the color
// framebuffer dpsoftrast.fb_colorpixels[0], combining them with the pixels
// already there according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the blend mode
//   triangle - not referenced by the lines visible here
//   span     - supplies x, y, startx, endx and the per-pixel pixelmask
//   in4ub    - source colors, indexed by x*4 (also read as 32-bit pixels via ini)
// Blending widens bytes to 16-bit lanes (_mm_unpacklo_epi8 with zero), does the
// arithmetic there, and narrows back with _mm_packus_epi16, which saturates
// each lane to 0..255.
2153 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2157 int startx = span->startx;
2158 int endx = span->endx;
2159 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2160 unsigned char * RESTRICT pixelmask = span->pixelmask;
2161 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2162 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views (byte-wise and 32-bit) to row span->y, column span->x,
// so that index x below is relative to the span origin
2165 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2166 pixeli += span->y * dpsoftrast.fb_width + span->x;
2167 // handle alphatest now (this affects depth writes too)
2168 if (thread->alphatest)
2169 for (x = startx;x < endx;x++)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" is only true for an
// alpha byte of 0; confirm whether a mid-range threshold (e.g. 128) was intended.
2170 if (in4ub[x*4+3] < 0.5f)
2171 pixelmask[x] = false;
2172 // FIXME: this does not handle bigendian
2173 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1 (read as one 32-bit value),
// four pixels are copied with a single unaligned 128-bit load/store.
2175 case DPSOFTRAST_BLENDMODE_OPAQUE:
2176 for (x = startx;x + 4 <= endx;)
2178 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2180 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1): the first loop handles pixels in pairs, switching
// on the two mask bytes read together as an unsigned short; a second loop then
// handles a remaining pixel one at a time and tests pixelmask[x] individually.
// In each case src and dst are widened to 16-bit lanes and the result is packed
// back with unsigned saturation.
// NOTE(review): the places where blend2/blend1 are expanded are not visible in
// this listing; presumably blend2 is used for the two-pixel case and blend1 for
// single pixels (the first argument at each call site shuffles both 64-bit
// halves, the second only the low half) - confirm.
// ALPHA: blend is the source's word 3 (the alpha byte) broadcast across each
// pixel; dst += ((src - dst) * blend) >> 8, computed as mulhi of both operands
// shifted left by 4.
2194 case DPSOFTRAST_BLENDMODE_ALPHA:
2195 #define FINISHBLEND(blend2, blend1) \
2196 for (x = startx;x + 1 < endx;x += 2) \
2199 switch (*(const unsigned short*)&pixelmask[x]) \
2202 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2203 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2205 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2208 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2209 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2211 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2214 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2215 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2217 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222 for(;x < endx; x++) \
2225 if (!pixelmask[x]) \
2227 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2228 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2241 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2244 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2246 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2247 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the clamp to 255 comes from the saturating pack on store)
2250 case DPSOFTRAST_BLENDMODE_ADD:
2251 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2253 case DPSOFTRAST_BLENDMODE_INVMOD:
2255 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2260 case DPSOFTRAST_BLENDMODE_MUL:
2261 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2263 case DPSOFTRAST_BLENDMODE_MUL2:
2264 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2266 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + (dst - ((dst * alpha) >> 8))
2275 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2277 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2278 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2280 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2281 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) >> 8, using the same shift-by-4 / mulhi form as ALPHA
2284 case DPSOFTRAST_BLENDMODE_INVADD:
2286 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the texture bound at thread->texbound[texunitindex] for every pixel x
// in [span->startx, span->endx) and writes four floats per pixel to out4f.
//   arrayindex - attribute handed to DPSOFTRAST_CALCATTRIB4F to obtain data[]/slope[]
//   zf         - per-pixel factor multiplied into the interpolated coordinate
// Texel byte 2 goes to output component 0 and byte 0 to component 2; bytes 1 and
// 3 keep their position. Coordinates are evaluated at sub-span boundaries and
// stepped in 16.16 fixed point in between.
2295 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2298 int startx = span->startx;
2299 int endx = span->endx;
2304 float tc[2], endtc[2];
2306 unsigned int tci[2];
2307 unsigned int tci1[2];
2308 unsigned int tcimin[2];
2309 unsigned int tcimax[2];
2314 const unsigned char * RESTRICT pixelbase;
2315 const unsigned char * RESTRICT pixel[4];
2316 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2317 // if no texture is bound, just fill it with white
2320 for (x = startx;x < endx;x++)
2322 out4f[x*4+0] = 1.0f;
2323 out4f[x*4+1] = 1.0f;
2324 out4f[x*4+2] = 1.0f;
2325 out4f[x*4+3] = 1.0f;
// the mip level is chosen per triangle; pixelbase points at that level's texels
2329 mip = triangle->mip[texunitindex];
2330 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2331 // if this mipmap of the texture is 1 pixel, just fill it with that color
2332 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the color is read from texture->bytes[0..3], not from pixelbase,
// so the mipmap[mip][0] offset computed above is not applied here - confirm.
2334 c[0] = texture->bytes[2] * (1.0f/255.0f);
2335 c[1] = texture->bytes[1] * (1.0f/255.0f);
2336 c[2] = texture->bytes[0] * (1.0f/255.0f);
2337 c[3] = texture->bytes[3] * (1.0f/255.0f);
2338 for (x = startx;x < endx;x++)
2340 out4f[x*4+0] = c[0];
2341 out4f[x*4+1] = c[1];
2342 out4f[x*4+2] = c[2];
2343 out4f[x*4+3] = c[3];
2347 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2348 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2349 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the horizontal and vertical size of the
// level: as coordinate scale, as row pitch in texels (tciwidth) and, minus one,
// as both the clamp maximum and the wrap mask.
// NOTE(review): using size-1 as an AND mask wraps correctly only for
// power-of-two sizes - confirm that is guaranteed. The assignment of tcimin is
// not visible in this listing.
2350 tcscale[0] = texture->mipmap[mip][2];
2351 tcscale[1] = texture->mipmap[mip][3];
2352 tciwidth = texture->mipmap[mip][2];
2355 tcimax[0] = texture->mipmap[mip][2]-1;
2356 tcimax[1] = texture->mipmap[mip][3]-1;
2357 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2358 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the first pixel: (data + slope*x) * zf[x] * size - 0.5
2359 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2360 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// walk the span in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2361 for (x = startx;x < endx;)
2363 unsigned int subtc[2];
2364 unsigned int substep[2];
2365 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2366 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end exactly on the final pixel and rescale the step to its length
2367 if (nextsub >= endx)
2369 nextsub = endsub = endx-1;
2370 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
// NOTE(review): tc is read below but its assignment is not visible in this
// listing; presumably it takes the previous endtc before endtc is recomputed - confirm.
2374 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2375 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// 16.16 fixed point: per-pixel step and starting coordinate
2376 substep[0] = (endtc[0] - tc[0]) * subscale;
2377 substep[1] = (endtc[1] - tc[1]) * subscale;
2378 subtc[0] = tc[0] * (1<<16);
2379 subtc[1] = tc[1] * (1<<16);
// four-texel weighted blend, texel indices clamped to [tcimin, tcimax]
2382 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2384 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// The four weights sum to 0x1000*0x1000, so dividing the weighted byte sum by
// 0xFF000000 (255 * 2^24) yields a value in 0..1.
// NOTE(review): frac takes the low 12 bits of the 16-bit fraction rather than
// its top 12 bits - confirm this is intended.
2386 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2387 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2388 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2389 tci[0] = subtc[0]>>16;
2390 tci[1] = subtc[1]>>16;
2391 tci1[0] = tci[0] + 1;
2392 tci1[1] = tci[1] + 1;
2393 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2394 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2395 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2396 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the 2x2 neighbourhood: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2397 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2398 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2399 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2400 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2401 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2402 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2403 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2404 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2405 out4f[x*4+0] = c[0];
2406 out4f[x*4+1] = c[1];
2407 out4f[x*4+2] = c[2];
2408 out4f[x*4+3] = c[3];
// same four-texel blend, but texel indices are wrapped with the size-1 mask
2413 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2415 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2416 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2417 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2418 tci[0] = subtc[0]>>16;
2419 tci[1] = subtc[1]>>16;
2420 tci1[0] = tci[0] + 1;
2421 tci1[1] = tci[1] + 1;
2422 tci[0] &= tciwrapmask[0];
2423 tci[1] &= tciwrapmask[1];
2424 tci1[0] &= tciwrapmask[0];
2425 tci1[1] &= tciwrapmask[1];
2426 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2427 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2428 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2429 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2430 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2431 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2432 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2433 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2434 out4f[x*4+0] = c[0];
2435 out4f[x*4+1] = c[1];
2436 out4f[x*4+2] = c[2];
2437 out4f[x*4+3] = c[3];
// single-texel lookup, texel indices clamped to [tcimin, tcimax]
2441 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2443 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2445 tci[0] = subtc[0]>>16;
2446 tci[1] = subtc[1]>>16;
2447 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2448 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2449 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2450 c[0] = pixel[0][2] * (1.0f / 255.0f);
2451 c[1] = pixel[0][1] * (1.0f / 255.0f);
2452 c[2] = pixel[0][0] * (1.0f / 255.0f);
2453 c[3] = pixel[0][3] * (1.0f / 255.0f);
2454 out4f[x*4+0] = c[0];
2455 out4f[x*4+1] = c[1];
2456 out4f[x*4+2] = c[2];
2457 out4f[x*4+3] = c[3];
// single-texel lookup, texel indices wrapped with the size-1 mask
2462 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2464 tci[0] = subtc[0]>>16;
2465 tci[1] = subtc[1]>>16;
2466 tci[0] &= tciwrapmask[0];
2467 tci[1] &= tciwrapmask[1];
2468 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2469 c[0] = pixel[0][2] * (1.0f / 255.0f);
2470 c[1] = pixel[0][1] * (1.0f / 255.0f);
2471 c[2] = pixel[0][0] * (1.0f / 255.0f);
2472 c[3] = pixel[0][3] * (1.0f / 255.0f);
2473 out4f[x*4+0] = c[0];
2474 out4f[x*4+1] = c[1];
2475 out4f[x*4+2] = c[2];
2476 out4f[x*4+3] = c[3];
// SSE2 texture sampler: for every pixel x in [span->startx, span->endx) looks up
// the texture bound at thread->texbound[texunitindex] and writes one packed
// 4-byte pixel to out4ub (also addressed as 32-bit values through outi).
//   arrayindex - attribute handed to DPSOFTRAST_CALCATTRIB to obtain data/slope
//   zf         - per-pixel factor multiplied into the interpolated coordinate
// Coordinates are kept in 16.16 fixed point, two pixels per __m128i, and are
// stepped linearly inside sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
// Each loop over pixel pairs is followed by a single-pixel variant of the same
// arithmetic for a leftover pixel (its guarding condition is not visible here).
2482 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2486 int startx = span->startx;
2487 int endx = span->endx;
2489 __m128 data, slope, tcscale;
2490 __m128i tcsize, tcmask, tcoffset, tcmax;
2492 __m128i subtc, substep, endsubtc;
2495 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2496 const unsigned char * RESTRICT pixelbase;
2497 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2498 // if no texture is bound, just fill it with white
2501 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2504 mip = triangle->mip[texunitindex];
2505 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2506 // if this mipmap of the texture is 1 pixel, just fill it with that color
2507 if (texture->mipmap[mip][1] == 4)
// k is the level's first texel read as one 32-bit value (the loop body that
// stores it is not visible in this listing)
2509 unsigned int k = *((const unsigned int *)pixelbase);
2510 for (x = startx;x < endx;x++)
2514 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2515 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2516 flags = texture->flags;
// tcsize lanes = { mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3] };
// tcmask is size-1 per lane, tcscale the size as float
2517 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2518 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2519 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the low two components into the high half and pre-multiply by the size
2520 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2521 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinate at the first pixel: (data + slope*x) * zf[x] - 0.5, then to 16.16
2522 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2523 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset: each 32-bit lane holds 4 in its low 16-bit word and (lane 0 of
// tcsize)*4 in its high word, so _mm_madd_epi16 of (x, y) word pairs with it
// yields x*4 + y*rowbytes, a byte offset from pixelbase
2524 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: size-1 packed into 16-bit words, used both as clamp maximum and wrap mask
2525 tcmax = _mm_packs_epi32(tcmask, tcmask);
2526 for (x = startx;x < endx;)
2528 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2529 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end exactly on the final pixel and rescale the step to its length
2530 if (nextsub >= endx)
2532 nextsub = endsub = endx-1;
2533 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// NOTE(review): tc and the starting value of subtc are read below but their
// assignments are not visible in this listing; presumably they take the
// previous endtc / endsubtc - confirm.
2537 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2538 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2539 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1;
// the step is doubled because the loops below advance two pixels at a time
2540 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2541 substep = _mm_slli_epi32(substep, 1);
// tcrange holds the integer coordinates at the sub-span start and at endsubtc
// plus one (doubled) step. The test passes only when every lane is >= 0 and
// < size-1, so texel+1 is also inside the level and each horizontal texel pair
// can be fetched with one 64-bit load, the next row being `stride` bytes further.
2544 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2545 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2547 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2548 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// pick the integer (high) words of each 16.16 coordinate, then turn (x, y) into byte offsets
2551 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2552 tci = _mm_madd_epi16(tci, tcoffset);
2553 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2554 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2555 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2556 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2557 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2558 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm = each 16-bit word shifted right by one; paired with a difference
// shifted left by one, _mm_mulhi_epi16 gives diff * frac / 65536.
// First blend between the two rows (second coordinate's fraction), then
// between the two columns (first coordinate's fraction).
2559 fracm = _mm_srli_epi16(subtc, 1);
2560 pix1 = _mm_add_epi16(pix1,
2561 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2563 pix3 = _mm_add_epi16(pix3,
2564 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2565 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2566 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2567 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2568 pix2 = _mm_add_epi16(pix2,
2569 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2570 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2571 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the in-range blend above
2575 const unsigned char * RESTRICT ptr1;
2576 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2577 tci = _mm_madd_epi16(tci, tcoffset);
2578 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2579 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2580 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2581 fracm = _mm_srli_epi16(subtc, 1);
2582 pix1 = _mm_add_epi16(pix1,
2583 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2584 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2585 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2586 pix1 = _mm_add_epi16(pix1,
2587 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2589 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// four-texel blend with clamping: the four (x, y) combinations are formed by
// adding {0,0}, {1,0}, {0,1}, {1,1} to the integer coordinate, clamped to
// [0, tcmax], and each texel is fetched with its own 32-bit load
2593 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2595 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2597 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2598 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2599 tci = _mm_madd_epi16(tci, tcoffset);
2600 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602 _mm_setzero_si128());
2603 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605 _mm_setzero_si128());
// NOTE(review): for the second pixel of the pair the coordinates are masked
// with _mm_and_si128 (as in the wrapping loop further down) instead of the
// min/max clamp used for the first pixel and for the single-pixel case -
// confirm whether this is intentional.
2606 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2607 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2608 tci = _mm_madd_epi16(tci, tcoffset);
2609 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2610 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2611 _mm_setzero_si128());
2612 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2613 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2614 _mm_setzero_si128());
2615 fracm = _mm_srli_epi16(subtc, 1);
2616 pix1 = _mm_add_epi16(pix1,
2617 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2619 pix3 = _mm_add_epi16(pix3,
2620 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2621 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2622 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2623 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2624 pix2 = _mm_add_epi16(pix2,
2625 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2626 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2627 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the clamped four-texel blend
2631 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2632 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2633 tci = _mm_madd_epi16(tci, tcoffset);
2634 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2635 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2636 _mm_setzero_si128());
2637 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2638 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2639 _mm_setzero_si128());
2640 fracm = _mm_srli_epi16(subtc, 1);
2641 pix1 = _mm_add_epi16(pix1,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2645 pix1 = _mm_add_epi16(pix1,
2646 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2647 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2648 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// four-texel blend with wrapping: same as above, but coordinates are ANDed with tcmax
2654 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2656 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2657 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661 _mm_setzero_si128());
2662 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664 _mm_setzero_si128());
2665 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2666 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2667 tci = _mm_madd_epi16(tci, tcoffset);
2668 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670 _mm_setzero_si128());
2671 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673 _mm_setzero_si128());
2674 fracm = _mm_srli_epi16(subtc, 1);
2675 pix1 = _mm_add_epi16(pix1,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2678 pix3 = _mm_add_epi16(pix3,
2679 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2680 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2681 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2682 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2683 pix2 = _mm_add_epi16(pix2,
2684 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2685 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2686 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the wrapped four-texel blend
2690 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2691 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2692 tci = _mm_madd_epi16(tci, tcoffset);
2693 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2694 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2695 _mm_setzero_si128());
2696 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2697 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2698 _mm_setzero_si128());
2699 fracm = _mm_srli_epi16(subtc, 1);
2700 pix1 = _mm_add_epi16(pix1,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704 pix1 = _mm_add_epi16(pix1,
2705 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel lookup with clamping: integer coordinates clamped to [0, tcmax],
// converted to a byte offset and the texel copied as one 32-bit value
2714 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2716 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2718 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2719 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2720 tci = _mm_madd_epi16(tci, tcoffset);
2721 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2722 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2726 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2727 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2728 tci = _mm_madd_epi16(tci, tcoffset);
2729 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel lookup with wrapping: integer coordinates ANDed with tcmax
2735 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2738 tci = _mm_and_si128(tci, tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2741 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2745 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2746 tci = _mm_and_si128(tci, tcmax);
2747 tci = _mm_madd_epi16(tci, tcoffset);
2748 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2757 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2760 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow map sampling entry point: takes a pointer to const float and returns a float.
// NOTE(review): the body of this function is not present in this listing, so
// what it reads from `vector` and what value it returns cannot be documented from here.
2763 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies every pixel of in4f by an interpolated four-component attribute and
// stores the product in out4f, for x in [span->startx, span->endx).
//   arrayindex - attribute handed to DPSOFTRAST_CALCATTRIB4F, which is expected
//                to fill data[] (base value) and slope[] (change per x)
//   zf         - per-pixel float array (see note below)
// Each component is evaluated as (data + slope*x) * z.
// NOTE(review): the declarations of x, c, data, slope and z, and the statement
// that assigns z inside the loop, are not visible in this listing; z is
// presumably zf[x] - confirm.
2769 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2772 int startx = span->startx;
2773 int endx = span->endx;
2778 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2779 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2782 c[0] = (data[0] + slope[0]*x) * z;
2783 c[1] = (data[1] + slope[1]*x) * z;
2784 c[2] = (data[2] + slope[2]*x) * z;
2785 c[3] = (data[3] + slope[3]*x) * z;
// component-wise modulation of the input color
2786 out4f[x*4+0] = in4f[x*4+0] * c[0];
2787 out4f[x*4+1] = in4f[x*4+1] * c[1];
2788 out4f[x*4+2] = in4f[x*4+2] * c[2];
2789 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated four-component attribute to out4f for every pixel x in
// [span->startx, span->endx).
//   arrayindex - attribute handed to DPSOFTRAST_CALCATTRIB4F, which is expected
//                to fill data[] (base value) and slope[] (change per x)
//   zf         - per-pixel float array (see note below)
// Each component is evaluated as (data + slope*x) * z.
// NOTE(review): the declarations of x, c, data, slope and z, and the statement
// that assigns z inside the loop, are not visible in this listing; z is
// presumably zf[x] - confirm.
2793 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2796 int startx = span->startx;
2797 int endx = span->endx;
2802 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2803 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2806 c[0] = (data[0] + slope[0]*x) * z;
2807 c[1] = (data[1] + slope[1]*x) * z;
2808 c[2] = (data[2] + slope[2]*x) * z;
2809 c[3] = (data[3] + slope[3]*x) * z;
2810 out4f[x*4+0] = c[0];
2811 out4f[x*4+1] = c[1];
2812 out4f[x*4+2] = c[2];
2813 out4f[x*4+3] = c[3];
2817 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2819 int x, startx = span->startx, endx = span->endx;
2820 float c[4], localcolor[4];
2821 localcolor[0] = subcolor[0];
2822 localcolor[1] = subcolor[1];
2823 localcolor[2] = subcolor[2];
2824 localcolor[3] = subcolor[3];
2825 for (x = startx;x < endx;x++)
2827 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2828 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2829 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2830 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2831 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2832 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2833 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2834 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2840 int x, startx = span->startx, endx = span->endx;
2841 for (x = startx;x < endx;x++)
2843 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2844 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2845 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2846 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float buffers over one span:
// out4f = ina4f + inb4f for every x in [span->startx, span->endx), four floats per pixel.
// No clamping is applied to the sum.
2850 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2852 int x, startx = span->startx, endx = span->endx;
2853 for (x = startx;x < endx;x++)
2855 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2856 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2857 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2858 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float buffers over one span: out4f = ina4f * a + inb4f * b per component,
// where a is one minus the fourth component of inb4f at that pixel.
// NOTE(review): the declaration of a/b and the assignment of b sit on lines omitted
// from this listing; b is presumably inb4f[x*4+3] - confirm against the full file.
2862 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2864 int x, startx = span->startx, endx = span->endx;
2866 for (x = startx;x < endx;x++)
// weight applied to the first buffer
2868 a = 1.0f - inb4f[x*4+3];
2870 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2871 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2872 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2873 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float buffer toward a constant color over one span:
// out4f = in4f * (1 - color[3]) + color * color[3] for every x in [span->startx, span->endx).
// The blend weight color[3] is the same for all four components, including the fourth.
2877 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2879 int x, startx = span->startx, endx = span->endx;
2880 float localcolor[4], ilerp, lerp;
// copy the color locally and derive the two loop-invariant blend weights
2881 localcolor[0] = color[0];
2882 localcolor[1] = color[1];
2883 localcolor[2] = color[2];
2884 localcolor[3] = color[3];
2885 ilerp = 1.0f - localcolor[3];
2886 lerp = localcolor[3];
2887 for (x = startx;x < endx;x++)
2889 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2890 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2891 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2892 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies 8-bit-per-channel input pixels (in4ub) by an interpolated vertex attribute
// (array `arrayindex`) and writes the result to out4ub for x in [span->startx, span->endx).
// The attribute is evaluated as (data + slope * x) * zf[x] only at sub-span boundaries
// (DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart, or the last pixel of the span) and stepped
// linearly in integer fixed point (scale 256) in between.
// NOTE(review): several short lines (declarations, braces, the assignments of mod/submod
// and the single-pixel tail condition) are omitted from this listing.
2898 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2902 int startx = span->startx;
2903 int endx = span->endx;
2906 __m128i submod, substep, endsubmod;
2907 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the interpolants (presumably RGBA -> BGRA order - confirm)
2908 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2909 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also converted to integer at scale 256
2910 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2911 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2912 for (x = startx; x < endx;)
2914 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2915 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last pixel and rescale the step to its length
2916 if (nextsub >= endx)
2918 nextsub = endsub = endx-1;
2919 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// evaluate the attribute at the sub-span end and derive the per-pixel integer step
2923 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2924 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2925 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit lanes: low half holds the value for pixel x, high half for pixel x+1
2926 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2927 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration; bytes are unpacked into the high byte of each 16-bit lane,
// so mulhi_epu16 yields (pixel * submod) >> 8, then packus saturates back to bytes
2928 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2930 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2931 pix = _mm_mulhi_epu16(pix, submod);
2932 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
2936 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2937 pix = _mm_mulhi_epu16(pix, submod);
2938 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (array `arrayindex`) as 8-bit-per-channel pixels
// into out4ub for x in [span->startx, span->endx). The attribute is evaluated as
// (data + slope * x) * zf[x] only at sub-span boundaries (DPSOFTRAST_DRAW_MAXSUBSPAN pixels
// apart, or the last pixel of the span) and stepped linearly in integer fixed point
// (scale 4095) in between; each value is shifted right by 4 before being packed to bytes.
// NOTE(review): several short lines (declarations, braces, the assignments of mod/submod
// and the single-pixel tail condition) are omitted from this listing.
2945 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2949 int startx = span->startx;
2950 int endx = span->endx;
2953 __m128i submod, substep, endsubmod;
2954 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 of the interpolants (presumably RGBA -> BGRA order - confirm)
2955 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2956 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, also converted to integer at scale 4095
2957 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2958 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2959 for (x = startx; x < endx;)
2961 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2962 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (possibly shorter) sub-span: end on the last pixel and rescale the step to its length
2963 if (nextsub >= endx)
2965 nextsub = endsub = endx-1;
2966 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// evaluate the attribute at the sub-span end and derive the per-pixel integer step
2970 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2971 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2972 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit lanes: low half holds the value for pixel x, high half for pixel x+1
2973 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2974 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration: drop the 4 fractional bits and saturate to unsigned bytes
2975 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2977 __m128i pix = _mm_srai_epi16(submod, 4);
2978 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
2982 __m128i pix = _mm_srai_epi16(submod, 4);
2983 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine over one span: for every x in [span->startx, span->endx) and each
// channel, out4ub = saturate(ina4ub + max(inb4ub - subcolor * 255, 0)).
// The difference is clamped at zero before it is added, matching the float version
// DPSOFTRAST_Draw_Span_AddBloom; without the clamp a bloom texel darker than the
// subtract color would darken the base image instead of leaving it unchanged.
// Pixels are processed two at a time, followed by a single-pixel tail.
2990 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2993 int x, startx = span->startx, endx = span->endx;
// scale the subtract color by 255, swap components 0 and 2, pack to 16-bit lanes and
// duplicate the low four lanes into the high four so two pixels share one constant
2994 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2995 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2996 for (x = startx;x+2 <= endx;x+=2)
2998 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2999 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// clamp the thresholded bloom at zero (signed 16-bit max) before adding it to the base
3000 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
3001 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3005 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3006 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3007 pix1 = _mm_add_epi16(pix1, _mm_max_epi16(_mm_sub_epi16(pix2, localcolor), _mm_setzero_si128()));
3008 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel product of two 8-bit-per-channel buffers over one span:
// out4ub = (ina4ub * inb4ub) >> 8 for every x in [span->startx, span->endx).
// Pixels are processed two at a time, followed by a single-pixel variant.
3013 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3016 int x, startx = span->startx, endx = span->endx;
3017 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds bytes in the low half of each 16-bit lane, pix2 in the high half,
// so the high 16 bits of the product equal (a * b) >> 8
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3021 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3026 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028 pix1 = _mm_mulhi_epu16(pix1, pix2);
3029 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Per-channel saturating sum of two 8-bit-per-channel buffers over one span:
// out4ub = min(ina4ub + inb4ub, 255) for every x in [span->startx, span->endx).
// Pixels are processed two at a time, followed by a single-pixel variant.
3034 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3037 int x, startx = span->startx, endx = span->endx;
3038 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes, add, then let packus saturate the result back to bytes
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3042 pix1 = _mm_add_epi16(pix1, pix2);
3043 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3047 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3049 pix1 = _mm_add_epi16(pix1, pix2);
3050 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer onto the first over one span:
// out4ub = saturate(ina4ub + inb4ub * inbtintbgra) per channel for x in [span->startx, span->endx).
// The tint is used in the component order given (no swizzle is applied here).
// Pixels are processed two at a time, followed by a single-pixel variant.
3055 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3058 int x, startx = span->startx, endx = span->endx;
// tint as integers at scale 256, packed to 16-bit lanes and duplicated for two pixels
3059 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3060 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3061 for (x = startx;x+2 <= endx;x+=2)
// pix2 is placed in the high byte of each lane so mulhi_epu16(tint, pix2) is (tint * b) >> 8
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3073 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends the second 8-bit-per-channel buffer over the first using the second buffer's
// fourth channel as weight: out4ub = ina4ub + ((inb4ub - ina4ub) * inb4ub[3]) >> 8
// per channel for x in [span->startx, span->endx).
// Pixels are processed two at a time, followed by a single-pixel variant.
3078 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3081 int x, startx = span->startx, endx = span->endx;
3082 for (x = startx;x+2 <= endx;x+=2)
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth channel) across that pixel's four lanes
3086 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4 so the signed high product equals (diff * blend) >> 8
3087 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant: only the low four lanes carry data, so one shufflelo suffices
3092 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3094 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3095 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3096 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends an 8-bit-per-channel buffer toward a constant color over one span, weighted by
// the color's fourth component: out4ub = in4ub + ((color255 - in4ub) * color255[3]) >> 8
// per channel for x in [span->startx, span->endx), where color255 is color scaled by 255.
// Pixels are processed two at a time, followed by a single-pixel variant.
3101 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3104 int x, startx = span->startx, endx = span->endx;
// scale the color by 255, swap components 0 and 2, pack to 16-bit lanes and duplicate for two pixels
3105 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3106 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// broadcast lane 3 (the blend weight) across each pixel and pre-shift it by 4 for the mulhi below
3107 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3108 for (x = startx;x+2 <= endx;x+=2)
3110 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3111 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3116 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3117 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage for the generic shader mode: transform-projects the position array with the
// ModelViewProjectionMatrix uniform, loads the color and TEXCOORD0 arrays, and loads
// TEXCOORD1 only when the SHADERPERMUTATION_SPECULAR bit is set.
3125 void DPSOFTRAST_VertexShader_Generic(void)
3127 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3128 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3129 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3131 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage for the generic shader mode. Fills buffer_z via DPSOFTRAST_Draw_Span_Begin, then:
//  - with SHADERPERMUTATION_DIFFUSE: samples the first texture (attribute array 2), multiplies
//    it by the interpolated attribute array 1 and, when SHADERPERMUTATION_SPECULAR is set,
//    samples the second texture and combines it by multiply (COLORMAPPING), add (GLOW) or
//    alpha mix (VERTEXTEXTUREBLEND);
//  - the interpolated attribute array 1 alone is written by Draw_Span_VaryingBGRA8
//    (NOTE(review): the `else` selecting this path is presumably on a line omitted from this listing).
// The resulting span is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3134 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3136 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3143 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3144 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3145 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3147 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// multiply the second texture into the fragment color
3148 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3151 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// add the second texture onto the fragment color; this branch used to repeat the
// COLORMAPPING test above, which made it unreachable
// NOTE(review): GLOW chosen to mirror the additive path of the GLSL generic shader - confirm
3153 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3156 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// blend the second texture over the fragment color using the second texture's fourth channel
3158 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3161 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3166 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3167 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the post-process shader mode: transform-projects the position array with
// the ModelViewProjectionMatrix uniform and loads the TEXCOORD0 and TEXCOORD1 arrays.
3172 void DPSOFTRAST_VertexShader_PostProcess(void)
3174 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3175 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3176 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage for the post-process shader mode: samples the first texture into the fragment
// buffer, optionally adds thresholded bloom from the second texture (SHADERPERMUTATION_BLOOM),
// blends toward the ViewTintColor uniform, and finishes the span. The saturation and
// gamma-ramp permutations are recognised but not implemented (see the TODOs below).
3179 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3181 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3182 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3183 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture, sampled with attribute array 2, written straight into the fragment buffer
3186 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3187 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture, sampled with attribute array 3, combined using the BloomColorSubtract uniform
3189 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3190 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3192 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3193 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3195 // TODO: implement saturation
3197 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3199 // TODO: implement gammaramps
3201 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the depth/shadow shader mode: only transform-projects the position array
// with the ModelViewProjectionMatrix uniform; no other attribute arrays are prepared.
3206 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3208 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the depth/shadow shader mode: zero-fills the fragment bytes for
// x in [span->startx, span->endx) and passes them to DPSOFTRAST_Draw_Span_FinishBGRA8.
3211 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3213 // this is never called (because colormask is off when this shader is used)
3214 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3215 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3216 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the covered part of the buffer is cleared (4 bytes per pixel)
3217 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3218 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the flat-color shader mode: transform-projects the position array with the
// ModelViewProjectionMatrix uniform and transforms TEXCOORD0 by the TexMatrix uniform.
3223 void DPSOFTRAST_VertexShader_FlatColor(void)
3225 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the flat-color shader mode: multiplies the sampled color texture by a
// constant built from the Color_Ambient uniform (first three lanes) and the Alpha uniform
// (fourth lane). Output goes directly to the framebuffer row unless alpha test is enabled or
// the blend mode is not opaque, in which case it goes to a temporary buffer that is then
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (braces, declarations of color/pix/pix2, any pixelmask skip and
// the extra x advance after the 4-pixel path) are omitted from this listing.
3229 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3232 unsigned char * RESTRICT pixelmask = span->pixelmask;
3233 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3234 int x, startx = span->startx, endx = span->endx;
3235 __m128i Color_Ambientm;
3236 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3237 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3241 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3242 pixel = buffer_FragColorbgra8;
// ambient color at scale 256 with components 0 and 2 swapped; the fourth lane is replaced
// by the Alpha uniform at scale 255; finally packed to 16-bit lanes
3243 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3244 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3245 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3246 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3247 for (x = startx;x < endx;x++)
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3250 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3253 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3254 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3255 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3256 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3262 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3263 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3264 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer (if used) still has to go through the common finish step
3266 if (pixel == buffer_FragColorbgra8)
3267 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the vertex-color shader mode: transform-projects the position array with
// the ModelViewProjectionMatrix uniform, loads the color array, and transforms TEXCOORD0 by
// the TexMatrix uniform.
3273 void DPSOFTRAST_VertexShader_VertexColor(void)
3275 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3276 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3277 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for the vertex-color shader mode: per pixel computes
// (Color_Diffuse * interpolated vertex color + Color_Ambient) * sampled color texture in
// 16-bit fixed point. Output goes directly to the framebuffer row unless alpha test is
// enabled or the blend mode is not opaque, in which case it goes to a temporary buffer that
// is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (braces, declarations of data/slope/mod2/pix2, any pixelmask skip
// and the extra x advance after the 4-pixel path) are omitted from this listing.
3280 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 unsigned char * RESTRICT pixelmask = span->pixelmask;
3284 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285 int x, startx = span->startx, endx = span->endx;
3286 __m128i Color_Ambientm, Color_Diffusem;
3288 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3289 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3292 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3293 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3294 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3295 pixel = buffer_FragColorbgra8;
// ambient color at scale 256 with components 0 and 2 swapped; the fourth lane is replaced
// by the Alpha uniform at scale 255; finally packed to 16-bit lanes
3296 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3297 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3298 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3299 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color at scale 4096 with components 0 and 2 swapped and the fourth lane zeroed
3300 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3301 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3302 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// set up the vertex-color interpolant: same component swap, advanced to startx, scaled by 4096
3303 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3304 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3305 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3306 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3307 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3308 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration
3309 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3311 __m128i color, mod, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3312 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// load four z values; each pixel's interpolant is data * its own z, and data is stepped
// three additional times inside this path (the loop increment supplies the fourth step)
3315 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3316 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3317 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3318 data = _mm_add_ps(data, slope);
3319 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3320 data = _mm_add_ps(data, slope);
3321 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3322 data = _mm_add_ps(data, slope);
3323 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) * texture color, for pixels 0-1 and 2-3 respectively
3324 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3325 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3326 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3327 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3328 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3334 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3335 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3336 mod = _mm_packs_epi32(mod, mod);
3337 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3338 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer (if used) still has to go through the common finish step
3340 if (pixel == buffer_FragColorbgra8)
3341 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the lightmap shader mode: transform-projects the position array with the
// ModelViewProjectionMatrix uniform, transforms TEXCOORD0 by the TexMatrix uniform, and
// loads the TEXCOORD4 array (used by the pixel stage to sample the lightmap texture).
3347 void DPSOFTRAST_VertexShader_Lightmap(void)
3349 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3350 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3351 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for the lightmap shader mode: per pixel computes
// (Color_Diffuse * lightmap texel + Color_Ambient) * color texel, plus Color_Glow * glow texel
// when SHADERPERMUTATION_GLOW is set, in 16-bit fixed point. Output goes directly to the
// framebuffer row unless alpha test is enabled or the blend mode is not opaque, in which case
// it goes to a temporary buffer that is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): short lines (braces, the `else` between the two loops, the declaration of
// pix2, any pixelmask skip and the extra x advance after the 4-pixel paths) are omitted
// from this listing.
3354 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3357 unsigned char * RESTRICT pixelmask = span->pixelmask;
3358 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3359 int x, startx = span->startx, endx = span->endx;
3360 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3361 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture is sampled with TEXCOORD0, the lightmap texture with TEXCOORD4
3367 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the span still needs alpha test / blending
3369 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3370 pixel = buffer_FragColorbgra8;
// ambient color at scale 256 with components 0 and 2 swapped; the fourth lane is replaced
// by the Alpha uniform at scale 255; finally packed to 16-bit lanes
3371 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3372 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3373 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3374 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color at scale 256 with components 0 and 2 swapped and the fourth lane zeroed
3375 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3378 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and build the glow constant the same way as diffuse
3380 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3381 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits: ambient, high 64 bits: glow - used by the single-pixel path below
3384 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3385 for (x = startx;x < endx;x++)
3387 __m128i color, lightmap, glow, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3388 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3391 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3392 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3393 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3394 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3395 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3396 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3397 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3398 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3399 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3400 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the upper four lanes of lightmap are zero, so one mulhi computes
// (diffuse*lightmap + ambient) * color in the low half and glow constant * glow texel in
// the high half; the two halves are then summed
3406 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3407 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3408 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3409 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3410 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3411 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3416 for (x = startx;x < endx;x++)
3418 __m128i color, lightmap, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3419 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3422 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3423 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3424 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3425 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3427 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3434 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3436 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3437 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the temporary buffer (if used) still has to go through the common finish step
3440 if (pixel == buffer_FragColorbgra8)
3441 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the fake-light shader mode: only transform-projects the position array
// with the ModelViewProjectionMatrix uniform; no other attribute arrays are prepared.
3447 void DPSOFTRAST_VertexShader_FakeLight(void)
3449 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the fake-light shader mode: no lighting is computed here; the fragment
// bytes for x in [span->startx, span->endx) are zero-filled and passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3452 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3456 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3457 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the covered part of the buffer is cleared (4 bytes per pixel)
3458 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3459 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the model-space light-direction-map mode: delegates unchanged to the
// lightmap vertex shader.
3464 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3466 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for the model-space light-direction-map mode: delegates unchanged to the
// lightmap pixel shader with the same thread, triangle and span.
3469 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3471 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the LightDirectionMap_TangentSpace shader mode.
// It has no logic of its own: it forwards to DPSOFTRAST_VertexShader_Lightmap().
3477 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3479 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage of the LightDirectionMap_TangentSpace shader mode.
// It has no logic of its own: thread, triangle and span are passed straight
// through to DPSOFTRAST_PixelShader_Lightmap().
3482 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3484 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the LightDirection shader mode.
// Takes no arguments and returns nothing; it reads and writes the global dpsoftrast state.
// For every vertex it replaces two texcoord arrays in dpsoftrast.post_array4f:
//   TEXCOORD1 <- the LightDir uniform expressed in the vertex's (svector, tvector, normal) basis
//   TEXCOORD2 <- (EyePosition - position) expressed in the same basis
// Both results are stored with a fourth component of 0. The position array is
// handed to DPSOFTRAST_Array_TransformProject only after the loop.
3490 void DPSOFTRAST_VertexShader_LightDirection(void)
3493 int numvertices = dpsoftrast.numvertices;
3495 float LightVector[4];
3496 float EyePosition[4];
3497 float EyeVectorModelSpace[4];
// Copy the two uniforms used below into locals (each uniform occupies 4 consecutive floats).
3503 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3504 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3505 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3506 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3507 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3508 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3509 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3510 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays the loop reads from post_array4f. TEXCOORD0 is the only one
// that goes through DPSOFTRAST_Array_Transform (with the TexMatrixM1 uniform); the
// others use DPSOFTRAST_Array_Load (presumably a plain copy into post_array4f - TODO confirm).
3511 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3512 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3513 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3514 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3515 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3516 for (i = 0;i < numvertices;i++)
// Snapshot position and the three basis vectors into locals first: TEXCOORD1 and
// TEXCOORD2 (the sources of svector and tvector) are overwritten further down.
3518 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3519 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3520 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3521 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3522 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3523 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3524 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3525 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3526 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3527 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3528 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3529 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: three dot products of LightDir with svector, tvector and normal.
// LightDir[3] is loaded above but not used in these sums.
3530 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3531 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3532 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3533 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3534 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3535 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, then the same three dot products.
3537 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3538 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3539 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3540 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3541 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3542 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3545 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Done last so the loop above sees positions as left by DPSOFTRAST_Array_Load.
// NOTE(review): the -1 second argument differs from the FakeLight call, which names
// POSITION twice; presumably it means "use the already-loaded data" - confirm against
// DPSOFTRAST_Array_TransformProject.
3548 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar and 3-component vector helper macros.
// These are textual macros, not functions: DPSOFTRAST_Min/Max expand each argument
// twice and DPSOFTRAST_Vector3Dot expands each argument three times, so arguments
// with side effects (x++, function calls) must not be passed.
3551 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3552 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product over elements [0], [1] and [2] of both operands; a fourth element is ignored.
3553 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3554 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3555 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// DPSOFTRAST_Vector3Normalize(v): multi-line statement macro (backslash continued).
// The part visible here computes the length of v as sqrt(dot(v,v)) into a float
// named len; the remaining continuation lines are not shown in this listing.
// NOTE(review): v is used unparenthesized in the Vector3Dot call below, unlike the
// Length macros above; the expansion of Vector3Dot parenthesizes it again.
3556 #define DPSOFTRAST_Vector3Normalize(v)\
3559 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage of the LightDirection shader mode: shades one horizontal span.
//   thread   - supplies uniform4f (colors, alpha, specular power) and shader_permutation flags
//   triangle - forwarded to the span helpers and DPSOFTRAST_CALCATTRIB4F
//   span     - forwarded likewise; span->startx / span->endx bound the pixel loops
// Result: buffer_FragColorbgra8 is filled for x in [startx, endx) and then passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8. Nothing is returned.
// Three shading paths are selected by shader_permutation:
//   SPECULAR      - ambient + diffuse + specular terms (the only path here that adds pants/shirt colormapping)
//   else DIFFUSE  - ambient + diffuse terms
//   remaining     - ambient term only
// In every path the GLOW flag adds a glow-texture term to the color channels.
3570 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, 4 bytes per pixel for the texture/color buffers.
// Only the buffers needed by the active permutation are filled below.
3572 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3573 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580 int x, startx = span->startx, endx = span->endx;
3581 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3582 float LightVectordata[4];
3583 float LightVectorslope[4];
3584 float EyeVectordata[4];
3585 float EyeVectorslope[4];
3587 float diffusetex[4];
3589 float surfacenormal[4];
3590 float lightnormal[4];
3592 float specularnormal[4];
3595 float SpecularPower;
// Color uniforms are stored with components 0 and 2 swapped (uniform +0 -> [2], +2 -> [0])
// so that index k of each Color_* array multiplies byte k of the *bgra8 buffers below.
// The fourth component is forced to 0 for every color except Color_Ambient.
3597 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3598 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3599 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3600 Color_Glow[3] = 0.0f;
3601 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3602 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3603 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// Ambient's fourth slot carries the Alpha uniform; it is the only factor applied to d[3] below.
3604 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3605 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3606 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3607 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3608 Color_Pants[3] = 0.0f;
3609 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3610 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3611 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3612 Color_Shirt[3] = 0.0f;
// Sample the textures every path needs. All lookups use TEXCOORD0 and buffer_z.
3613 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3614 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3615 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3617 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3618 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3620 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3622 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR (ambient + diffuse + specular) ----
3624 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3626 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3627 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3628 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3629 Color_Diffuse[3] = 0.0f;
3630 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3631 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3632 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3633 LightColor[3] = 0.0f;
// The *data / *slope pairs are evaluated per pixel below as (data + slope*x) * z.
3634 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3635 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3636 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3637 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3638 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3639 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because it is later multiplied by glosstex[3], a raw 0..255 byte.
3640 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3641 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3642 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3643 for (x = startx;x < endx;x++)
// NOTE(review): z is used below but its assignment is not on any line shown here;
// presumably it is read from buffer_z[x] at the top of each iteration - confirm.
3646 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3647 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3648 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3649 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the tinted pants and shirt layers to the base color.
// Component 3 is unaffected because Color_Pants[3] and Color_Shirt[3] are 0.
3650 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3652 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3653 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3654 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3655 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3657 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3658 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3659 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3660 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte/128 - 1 maps 0..255 to about -1..+1, and the
// byte order is reversed (texel byte 2 -> component 0, byte 0 -> component 2).
3661 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3662 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3663 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3664 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Interpolated light vector (TEXCOORD1) at this pixel.
3666 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3667 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3668 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3669 DPSOFTRAST_Vector3Normalize(lightnormal);
// Interpolated eye vector (TEXCOORD2) at this pixel.
3671 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3672 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3673 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3674 DPSOFTRAST_Vector3Normalize(eyenormal);
// Specular direction: sum of the light and eye vectors, passed through the normalize macro.
3676 specularnormal[0] = lightnormal[0] + eyenormal[0];
3677 specularnormal[1] = lightnormal[1] + eyenormal[1];
3678 specularnormal[2] = lightnormal[2] + eyenormal[2];
3679 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both dot products are floored at 0 before use; the specular one is then raised
// to the power (SpecularPower uniform / 255) * gloss texel byte 3.
3681 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3682 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3683 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine per channel. Values above 255 are clamped to 255; there is no lower clamp.
// With GLOW: glow texel * Color_Glow is added to channels 0..2.
3684 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3686 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3687 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3688 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3689 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term (the alternative to the GLOW case above).
3693 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3694 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3695 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3696 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3698 buffer_FragColorbgra8[x*4+0] = d[0];
3699 buffer_FragColorbgra8[x*4+1] = d[1];
3700 buffer_FragColorbgra8[x*4+2] = d[2];
3701 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE (ambient + diffuse, no specular, no eye vector) ----
// NOTE(review): unlike the SPECULAR path, no pants/shirt colormapping is added here.
3704 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3706 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3707 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3708 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3709 Color_Diffuse[3] = 0.0f;
3710 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3711 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3712 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3713 LightColor[3] = 0.0f;
3714 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3715 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716 for (x = startx;x < endx;x++)
3719 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3720 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3721 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3722 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Same normal-map decode and light-vector interpolation as in the SPECULAR path.
3723 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3724 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3725 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3726 DPSOFTRAST_Vector3Normalize(surfacenormal);
3728 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3729 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3730 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3731 DPSOFTRAST_Vector3Normalize(lightnormal);
3733 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// channel = [glow +] color * (ambient + diffuseColor * diffuse * lightColor), clamped at 255.
3734 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3736 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3737 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3738 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3739 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// Same formula without the glow term.
3743 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3744 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3745 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3746 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3748 buffer_FragColorbgra8[x*4+0] = d[0];
3749 buffer_FragColorbgra8[x*4+1] = d[1];
3750 buffer_FragColorbgra8[x*4+2] = d[2];
3751 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (neither SPECULAR nor DIFFUSE) ----
// No normal map, light vector or colormapping is used; color texel * Color_Ambient [+ glow].
3756 for (x = startx;x < endx;x++)
3759 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3760 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3761 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3762 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3764 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3766 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3767 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3768 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3769 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term.
3773 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3774 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3775 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3776 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3778 buffer_FragColorbgra8[x*4+0] = d[0];
3779 buffer_FragColorbgra8[x*4+1] = d[1];
3780 buffer_FragColorbgra8[x*4+2] = d[2];
3781 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finish helper.
3784 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the LightSource shader mode.
// Takes no arguments and returns nothing; it reads and writes the global dpsoftrast state.
// For every vertex it replaces two texcoord arrays in dpsoftrast.post_array4f:
//   TEXCOORD1 <- (LightPosition - position) expressed in the vertex's (svector, tvector, normal) basis
//   TEXCOORD2 <- (EyePosition - position) expressed in the same basis
// Both results are stored with a fourth component of 0. After the loop the position
// array goes through DPSOFTRAST_Array_TransformProject, and DPSOFTRAST_Array_Transform
// is called for TEXCOORD3 with the ModelToLightM1 uniform.
3789 void DPSOFTRAST_VertexShader_LightSource(void)
3792 int numvertices = dpsoftrast.numvertices;
3793 float LightPosition[4];
3794 float LightVector[4];
3795 float LightVectorModelSpace[4];
3796 float EyePosition[4];
3797 float EyeVectorModelSpace[4];
// Copy the two uniforms used below into locals (each uniform occupies 4 consecutive floats).
3803 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3804 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3805 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3806 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3807 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3808 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3809 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3810 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays the loop reads from post_array4f. TEXCOORD0 is transformed with
// the TexMatrixM1 uniform; the others use DPSOFTRAST_Array_Load. TEXCOORD4 is loaded
// too, although the loop below does not read it.
3811 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3812 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3813 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3814 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3815 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3816 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3817 for (i = 0;i < numvertices;i++)
// Snapshot position and the three basis vectors into locals first: TEXCOORD1 and
// TEXCOORD2 (the sources of svector and tvector) are overwritten further down.
3819 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3820 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3821 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3822 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3823 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3824 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3825 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3826 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3827 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3828 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3829 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3830 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: vertex-to-light offset, then dot products with svector, tvector and normal.
// It is stored as computed; no normalization happens in this function.
3831 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3832 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3833 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3834 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3835 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3836 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3837 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3838 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3839 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, then the same three dot products.
3841 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3842 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3843 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3844 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3845 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3846 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3847 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3848 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3849 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3850 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Both calls come after the loop, so the loop reads position and the TEXCOORD3 normal
// as left by DPSOFTRAST_Array_Load.
// NOTE(review): the second call names TEXCOORD3 first and POSITION second with the
// ModelToLightM1 uniform; presumably TEXCOORD3 is the destination and ends up holding
// the light-space position that the pixel shader interpolates as CubeVector. Whether
// POSITION is read before or after the projection on the previous line cannot be
// determined from this function - confirm against DPSOFTRAST_Array_Transform.
3852 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3853 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3856 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3859 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3860 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3861 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3862 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3863 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3864 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3865 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3866 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3867 int x, startx = span->startx, endx = span->endx;
3868 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3869 float CubeVectordata[4];
3870 float CubeVectorslope[4];
3871 float LightVectordata[4];
3872 float LightVectorslope[4];
3873 float EyeVectordata[4];
3874 float EyeVectorslope[4];
3876 float diffusetex[4];
3878 float surfacenormal[4];
3879 float lightnormal[4];
3881 float specularnormal[4];
3884 float SpecularPower;
3885 float CubeVector[4];
3888 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3889 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3890 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3891 Color_Glow[3] = 0.0f;
3892 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3893 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3894 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3895 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3896 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3897 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3898 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3899 Color_Diffuse[3] = 0.0f;
3900 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3901 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3902 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3903 Color_Specular[3] = 0.0f;
3904 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3905 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3906 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3907 Color_Pants[3] = 0.0f;
3908 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3909 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3910 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3911 Color_Shirt[3] = 0.0f;
3912 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3913 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3914 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3915 LightColor[3] = 0.0f;
3916 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3917 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3918 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3919 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3920 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3921 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3922 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3923 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3928 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3929 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3930 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3932 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3933 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3934 for (x = startx;x < endx;x++)
3937 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3938 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3939 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3940 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3941 if (attenuation < 0.01f)
3943 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3945 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3946 if (attenuation < 0.01f)
3950 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3951 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3952 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3953 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3954 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3956 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3957 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3958 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3959 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3961 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3962 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3963 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3964 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3965 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3966 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3967 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3968 DPSOFTRAST_Vector3Normalize(surfacenormal);
3970 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3971 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3972 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3973 DPSOFTRAST_Vector3Normalize(lightnormal);
3975 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3976 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3977 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3978 DPSOFTRAST_Vector3Normalize(eyenormal);
3980 specularnormal[0] = lightnormal[0] + eyenormal[0];
3981 specularnormal[1] = lightnormal[1] + eyenormal[1];
3982 specularnormal[2] = lightnormal[2] + eyenormal[2];
3983 DPSOFTRAST_Vector3Normalize(specularnormal);
3985 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3986 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3987 specular = pow(specular, SpecularPower * glosstex[3]);
3988 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3990 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3991 attenuation *= (1.0f / 255.0f);
3992 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3993 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3994 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3995 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3999 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4000 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4001 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4002 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4004 buffer_FragColorbgra8[x*4+0] = d[0];
4005 buffer_FragColorbgra8[x*4+1] = d[1];
4006 buffer_FragColorbgra8[x*4+2] = d[2];
4007 buffer_FragColorbgra8[x*4+3] = d[3];
4010 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4012 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4013 for (x = startx;x < endx;x++)
4016 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4017 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4018 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4019 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4020 if (attenuation < 0.01f)
4022 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4024 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4025 if (attenuation < 0.01f)
4029 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4030 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4031 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4032 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4033 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4035 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4036 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4037 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4038 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4040 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4041 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4042 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4043 DPSOFTRAST_Vector3Normalize(surfacenormal);
4045 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4046 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4047 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4048 DPSOFTRAST_Vector3Normalize(lightnormal);
4050 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4051 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4053 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4054 attenuation *= (1.0f / 255.0f);
4055 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4056 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4057 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4058 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4062 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4063 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4064 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4065 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4067 buffer_FragColorbgra8[x*4+0] = d[0];
4068 buffer_FragColorbgra8[x*4+1] = d[1];
4069 buffer_FragColorbgra8[x*4+2] = d[2];
4070 buffer_FragColorbgra8[x*4+3] = d[3];
4075 for (x = startx;x < endx;x++)
4078 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4079 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4080 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4081 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4082 if (attenuation < 0.01f)
4084 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4086 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4087 if (attenuation < 0.01f)
4091 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4092 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4093 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4094 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4095 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4097 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4098 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4099 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4100 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4102 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4104 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4105 attenuation *= (1.0f / 255.0f);
4106 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4107 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4108 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4109 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4113 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4114 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4115 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4116 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4118 buffer_FragColorbgra8[x*4+0] = d[0];
4119 buffer_FragColorbgra8[x*4+1] = d[1];
4120 buffer_FragColorbgra8[x*4+2] = d[2];
4121 buffer_FragColorbgra8[x*4+3] = d[3];
4124 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4130 void DPSOFTRAST_VertexShader_Refraction(void)
4132 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4135 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4138 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4139 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4140 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4141 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4142 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4147 void DPSOFTRAST_VertexShader_Water(void)
4149 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4153 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4156 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4157 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4159 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4160 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4165 void DPSOFTRAST_VertexShader_ShowDepth(void)
4167 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4170 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4173 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4174 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4175 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4176 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4177 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4182 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4184 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4187 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4190 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4191 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4192 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4193 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4194 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4199 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4201 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4204 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4207 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4208 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4209 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4210 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the functions that implement it and the inputs it
// consumes. One record per mode is stored in DPSOFTRAST_ShaderModeTable.
// NOTE(review): the table rows begin with an integer and the draw code reads a
// `.lodarrayindex` member, so a leading integer member is expected here - confirm
// the member order before editing the table initializers, which are positional.
4216 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage, called with no arguments
4219 void (*Vertex)(void);
// span (pixel) stage, called once per span with the owning thread and triangle
4220 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array ids used by the mode; readers compare each entry against
// DPSOFTRAST_ARRAY_TOTAL, so a value at or above it (the tables use ~0) ends the list
4221 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit ids used by the mode; readers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, so a value at or above it (the tables use ~0) ends the list
4222 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4224 DPSOFTRAST_ShaderModeInfo;
4226 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4228 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4229 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4230 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4231 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4232 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4233 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4234 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4235 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4236 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4237 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4238 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4239 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4240 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4241 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4242 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4243 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Runs every span queued on this thread (thread->spans[0 .. numspans-1]) through
// the optional depth test and the current shader mode's span function, then
// empties the queue by resetting thread->numspans to 0.
//   thread - per-thread rasterizer state holding the span and triangle queues,
//            depth/colour settings and the active shader mode
4246 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4253 // unsigned int *colorpixel;
4254 unsigned int *depthpixel;
4260 DPSOFTRAST_State_Triangle *triangle;
4261 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the span being processed; span->pixelmask is
// pointed at this local buffer before the span function is called
4262 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4263 for (i = 0; i < thread->numspans; i++)
4265 span = &thread->spans[i];
4266 triangle = &thread->triangles[span->triangle];
4267 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (triangle->w: x slope, y slope, origin) at the
// span position and convert it to the integer depth scale, including polygon offset
4269 wslope = triangle->w[0];
4270 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4271 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4272 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer relative to the span origin (span->x, span->y)
4273 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4274 startx = span->startx;
// fill pixelmask[startx..endx) with the result of comparing the stored depth
// against the interpolated depth d, according to the active depth function
4276 switch(thread->fb_depthfunc)
4279 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4280 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4281 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4282 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4283 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4284 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4285 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4287 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4288 //for (x = startx;x < endx;x++)
4289 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4290 // if there is no color buffer, skip pixel shader
// shrink the span from both ends past pixels that failed the depth test
4291 while (startx < endx && !pixelmask[startx])
4293 while (endx > startx && !pixelmask[endx-1])
4296 continue; // no pixels to fill
4297 span->pixelmask = pixelmask;
4298 span->startx = startx;
4300 // run pixel shader if appropriate
4301 // do this before running depthmask code, to allow the pixelshader
4302 // to clear pixelmask values for alpha testing
4303 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4304 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the trimmed span again with the same interpolated
// depth values (presumably storing d for pixels still set in pixelmask - TODO confirm)
4305 if (thread->depthmask)
4306 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4312 // no depth testing means we're just dealing with color...
4313 // if there is no color buffer, skip pixel shader
4314 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes when no depth test is performed
4316 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4317 span->pixelmask = pixelmask;
4318 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4322 thread->numspans = 0;
// Declares the Draw command record that DPSOFTRAST_Interpret_Draw receives as
// DPSOFTRAST_Command_Draw. Members listed here: size of the attached data, the
// starty/endy scanline range tested against each thread's bands, a reference
// count that each interpreting thread decrements atomically, the clipped flag,
// vertex/triangle counts, the vertex data pointer (arrays) and the optional
// 32-bit / 16-bit index lists.
// NOTE(review): DEFCOMMAND is defined outside this section; presumably 22 is the
// opcode value and the macro also supplies the common command header - confirm.
4325 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on the given thread.
// For every triangle of the command it clips against the near plane, rejects
// triangles outside the clip rectangle, sets up the w plane, the per-array
// interpolants and the per-texture-unit mip level, then scan-converts the
// triangle into spans for the thread's two scanline ranges
// (thread->miny1..maxy1 and thread->miny2..maxy2, clamped to the scissor).
// Spans and triangles are queued on the thread and flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever a queue fills up and once more at the end.
//   thread  - per-thread rasterizer state (scissor, viewport, queues, bound textures)
//   command - the draw command; its refcount is decremented here, and the thread
//             that brings it to zero frees command->arrays when the command size
//             shows that the vertex data was allocated separately from the command
4327 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4330 int cullface = thread->cullface;
4331 int minx, maxx, miny, maxy;
4332 int miny1, maxy1, miny2, maxy2;
4333 __m128i fbmin, fbmax;
4334 __m128 viewportcenter, viewportscale;
4335 int firstvertex = command->firstvertex;
4336 int numvertices = command->numvertices;
4337 int numtriangles = command->numtriangles;
4338 const int *element3i = command->element3i;
4339 const unsigned short *element3s = command->element3s;
4340 int clipped = command->clipped;
4347 int starty, endy, bandy;
4351 __m128 triangleedge1, triangleedge2, trianglenormal;
4354 DPSOFTRAST_State_Triangle *triangle;
4355 DPSOFTRAST_Texture *texture;
4356 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two scanline ranges to the vertical extent of the scissor rectangle
4357 miny = thread->fb_scissor[1];
4358 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4359 miny1 = bound(miny, thread->miny1, maxy);
4360 maxy1 = bound(miny, thread->maxy1, maxy);
4361 miny2 = bound(miny, thread->miny2, maxy);
4362 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's scanline range misses both of this thread's ranges:
// only the reference on the command has to be released
4363 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4365 if (!ATOMIC_DECREMENT(command->refcount))
4367 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4368 MM_FREE(command->arrays);
// clip rectangle packed as repeated 16-bit (x, y) pairs for the bounding-box test;
// fbmax is made inclusive by subtracting 1 from every lane
4372 minx = thread->fb_scissor[0];
4373 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4374 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4375 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4376 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4377 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4378 screen[3] = _mm_setzero_ps();
4379 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4380 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space coordinates, followed by
// one block of numvertices float[4] per vertex array
4382 const float *screencoord4f = command->arrays;
4383 const float *arrays = screencoord4f + numvertices*4;
4385 // generate the 3 edges of this triangle
4386 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are made relative to firstvertex, from the 16-bit list ...
4389 e[0] = element3s[i*3+0] - firstvertex;
4390 e[1] = element3s[i*3+1] - firstvertex;
4391 e[2] = element3s[i*3+2] - firstvertex;
// ... or from the 32-bit list
4395 e[0] = element3i[i*3+0] - firstvertex;
4396 e[1] = element3i[i*3+1] - firstvertex;
4397 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//   SKIPBACKFACE       - builds the two edges leaving screen[1] and one cross-product
//                        term from them, then tests its sign against zero
//                        (presumably selected by cullface - TODO confirm)
//   CLIPPEDVERTEXLERP  - stores the clip fraction along edge p1->p2 in clipfrac[p1] and
//                        projects the interpolated vertex into screen[k]
//   CLIPPEDVERTEXCOPY  - loads the already projected vertex p1 into screen[k]
//   GENATTRIBCOPY/LERP - fetch an attribute unchanged, or interpolated with clipfrac[p1]
//   GENATTRIBS         - picks the copy/lerp combination for the three attribute values
4406 #define SKIPBACKFACE \
4407 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4408 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4409 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4410 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4411 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4415 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4419 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4424 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4425 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4427 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4428 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4430 #define CLIPPEDVERTEXCOPY(k,p1) \
4431 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4433 #define GENATTRIBCOPY(attrib, p1) \
4434 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4435 #define GENATTRIBLERP(attrib, p1, p2) \
4437 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4438 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4440 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4444 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4445 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4446 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4447 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4448 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4449 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4450 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4456 // calculate distance from nearplane
4457 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4458 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4459 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the triangle by which vertices have a non-negative clip distance and
// build the clipped polygon (three or four screen-space points) for each case
4460 if (clipdist[0] >= 0.0f)
4462 if (clipdist[1] >= 0.0f)
4464 if (clipdist[2] >= 0.0f)
4467 // triangle is entirely in front of nearplane
4468 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4475 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4483 if (clipdist[2] >= 0.0f)
4485 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4492 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4499 else if (clipdist[1] >= 0.0f)
4501 if (clipdist[2] >= 0.0f)
4503 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4510 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4516 else if (clipdist[2] >= 0.0f)
4518 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4523 else continue; // triangle is entirely behind nearplane
4526 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as 16-bit values; the
// min/max reductions below give the polygon's bounding box, clamped to fbmin/fbmax
4527 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4528 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4529 screenmin = _mm_min_epi16(screeni, screenir),
4530 screenmax = _mm_max_epi16(screeni, screenir);
4531 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4532 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4533 screenmin = _mm_max_epi16(screenmin, fbmin);
4534 screenmax = _mm_min_epi16(screenmax, fbmax);
4535 // skip offscreen triangles
4536 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4538 starty = _mm_extract_epi16(screenmin, 1);
4539 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two scanline ranges
4540 if (starty >= maxy1 && endy <= miny2)
// keep the 16-bit y of each point, sign-extended into its own 32-bit lane
4542 screeny = _mm_srai_epi32(screeni, 16);
// set up the next free slot of the per-thread triangle queue
4545 triangle = &thread->triangles[thread->numtriangles];
4547 // calculate attribute plans for triangle data...
4548 // okay, this triangle is going to produce spans, we'd better project
4549 // the interpolants now (this is what gives perspective texturing),
4550 // this consists of simply multiplying all arrays by the W coord
4551 // (which is basically 1/Z), which will be undone per-pixel
4552 // (multiplying by Z again) to get the perspective-correct array
4555 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4556 __m128 mipedgescale, mipdensity;
// edge components divided by the cross-product term; the broadcasts below are
// reused as the coefficients that turn per-edge deltas into x and y slopes
4557 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4558 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4559 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4560 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4561 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for w, stored in triangle->w as x slope, y slope, origin
4562 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4563 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4564 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4565 attribedge1 = _mm_sub_ss(w0, w1);
4566 attribedge2 = _mm_sub_ss(w2, w1);
4567 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4568 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4569 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4570 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4571 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4572 _mm_store_ss(&triangle->w[0], attribxslope);
4573 _mm_store_ss(&triangle->w[1], attribyslope);
4574 _mm_store_ss(&triangle->w[2], attriborigin);
4575 mipedgescale = _mm_setzero_ps();
// same plane setup for every vertex array used by the shader mode, with each
// attribute pre-multiplied by its vertex w
4576 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4578 __m128 attrib0, attrib1, attrib2;
4579 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4580 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4582 arrays += numvertices*4;
4583 GENATTRIBS(attrib0, attrib1, attrib2);
4584 attriborigin = _mm_mul_ps(attrib1, w1);
4585 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4586 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4587 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4588 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4589 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4590 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4591 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4592 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex supplies the attribute deltas along both
// edges, scaled by the reciprocal screen-space edge lengths, for mip selection
4593 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4595 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4596 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4597 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4598 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit used by the shader mode; units keep level 0
// unless the bound texture's filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4602 memset(triangle->mip, 0, sizeof(triangle->mip));
4603 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4605 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4606 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4608 texture = thread->texbound[texunit];
4609 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4611 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4612 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4613 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4614 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4615 // this will be multiplied in the texturing routine by the texture resolution
4616 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, clamped to the last mip level
4619 y = (int)(log((float)y)*0.5f/M_LN2);
4620 if (y > texture->mipmaps - 1)
4621 y = texture->mipmaps - 1;
4622 triangle->mip[texunit] = y;
// walk the scanlines of the triangle, first within the first range (up to maxy1),
// then continuing from miny2 within the second range (up to maxy2)
4628 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4631 __m128 xcoords, xslope;
// one 4-bit group per point: set when y is greater than that point's integer y
4632 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4633 int yccmask = _mm_movemask_epi8(ycc);
4634 int edge0p, edge0n, edge1p, edge1n;
// four-point polygon: pick the two edges (as point index pairs) crossed by scanline y
4641 case 0xFFFF: /*0000*/ y = endy; continue;
4642 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4643 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4644 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4645 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4646 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4647 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4648 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4649 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4650 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4651 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4652 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4653 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4654 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4655 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4656 case 0x0000: /*1111*/ y++; continue;
// three-point polygon: same selection with three points
4664 case 0xFFFF: /*000*/ y = endy; continue;
4665 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4666 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4667 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4668 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4669 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4670 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4671 case 0x0000: /*111*/ y++; continue;
// nexty: last scanline handled with this edge pair, limited to the current range
// (presumably the smallest point y not yet passed - TODO confirm)
4674 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4675 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4676 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4677 nexty = _mm_extract_epi16(ycc, 0);
4678 if (nexty >= bandy) nexty = bandy-1;
// per-scanline x step of both edges (delta divided by each edge's y delta) and
// their x positions at scanline y, biased by 0.5
4679 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4680 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4681 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4682 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4683 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two edges when the first one is to the right of the second
4684 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4686 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4687 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// emit spans for scanlines y..nexty, stepping both edge positions each line
4689 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4691 int startx, endx, offset;
4692 startx = _mm_cvtss_si32(xcoords);
4693 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4696 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
4697 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4699 if (endx > maxx) endx = maxx;
4700 if (startx >= endx) continue;
// split the scanline into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the chunk offset
4701 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4703 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4704 span->triangle = thread->numtriangles;
4707 span->startx = max(minx - offset, 0);
4708 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4709 if (span->startx >= span->endx)
// flush when the span queue is full
4711 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4712 DPSOFTRAST_Draw_ProcessSpans(thread);
// flush pending spans before the triangle queue wraps, since spans refer to
// triangles by queue index
4717 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4719 DPSOFTRAST_Draw_ProcessSpans(thread);
4720 thread->numtriangles = 0;
// release this thread's reference; the last thread frees separately allocated vertex data
4724 if (!ATOMIC_DECREMENT(command->refcount))
4726 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4727 MM_FREE(command->arrays);
// process whatever is still queued for this command
4730 if (thread->numspans > 0 || thread->numtriangles > 0)
4732 DPSOFTRAST_Draw_ProcessSpans(thread);
4733 thread->numtriangles = 0;
// Allocates a Draw command for numvertices vertices and numtriangles triangles,
// reserves storage for the per-vertex float[4] arrays and a private copy of the
// index list, and points the global vertex-stage state (dpsoftrast.firstvertex,
// numvertices, screencoord4f, post_array4f[]) at that storage.
// The storage lives either inline after the command header or, if the command
// would be too large, in a separate MM_CALLOC block referenced by command->arrays.
// DPSOFTRAST_DrawTriangles uses the result as the queued command.
4738 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4742 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays per vertex are always reserved: screencoord4f and the
// post-transform POSITION array (both are carved out of data further down)
4743 int datasize = 2*numvertices*sizeof(float[4]);
4744 DPSOFTRAST_Command_Draw *command;
4745 unsigned char *data;
// one more float[4] array per vertex for each array slot listed by the current shader mode
4746 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4748 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// out-of-range table values are tested for here; presumably they mark unused
// slots and are skipped -- the statement under this test is not visible, confirm
4749 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4751 datasize += numvertices*sizeof(float[4]);
// room for the index copy: 3 unsigned shorts or 3 ints per triangle
// (presumably selected by which of element3s / element3i is supplied -- confirm)
4754 datasize += numtriangles*sizeof(unsigned short[3]);
4756 datasize += numtriangles*sizeof(int[3]);
4757 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too big for a single pool command: queue only the header and keep the data out of line
4758 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4760 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
// NOTE(review): no NULL check of this allocation is visible before data is used -- confirm
4761 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data is placed in the command pool directly behind the header
4765 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4766 data = (unsigned char *)command + commandsize;
4768 command->firstvertex = firstvertex;
4769 command->numvertices = numvertices;
4770 command->numtriangles = numtriangles;
// arrays marks the start of the data block; code that frees out-of-line data passes this pointer to MM_FREE
4771 command->arrays = (float *)data;
// clear all output array pointers, then hand out consecutive numvertices*float[4] slices of data
4772 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4773 dpsoftrast.firstvertex = firstvertex;
4774 dpsoftrast.numvertices = numvertices;
4775 dpsoftrast.screencoord4f = (float *)data;
4776 data += numvertices*sizeof(float[4]);
4777 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4778 data += numvertices*sizeof(float[4]);
// same walk over the shader mode's array list as in the size calculation above
4779 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4781 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4782 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4784 dpsoftrast.post_array4f[j] = (float *)data;
4785 data += numvertices*sizeof(float[4]);
// the index list is copied into the command's own storage, so the caller's
// element buffer is not referenced by the command afterwards
4787 command->element3i = NULL;
4788 command->element3s = NULL;
4791 command->element3s = (unsigned short *)data;
4792 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4796 command->element3i = (int *)data;
4797 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one Draw command for the given vertex range
// and triangle list, runs the current shader mode's vertex stage right away on
// the calling thread, and then either wakes the worker threads whose row bands
// the draw touches or flushes the queue directly.
//   firstvertex, numvertices - vertex range to process
//   numtriangles             - number of triangles in the index list
//   element3i / element3s    - index list as ints or unsigned shorts
4802 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4804 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// vertex stage of the active shader mode; it works on the global state that the allocation call just set up
4805 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range reported in dpsoftrast.drawstarty/drawendy to the framebuffer
4806 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4807 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: drop the command again instead of queueing it
4808 if (command->starty >= command->endy)
// a header-only command means the data was allocated out of line and has to be released here
4810 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4811 MM_FREE(command->arrays);
4812 DPSOFTRAST_UndoCommand(command->commandsize);
4815 command->clipped = dpsoftrast.drawclipped;
// one reference per thread state; the interpreting side decrements it with ATOMIC_DECREMENT
4816 command->refcount = dpsoftrast.numthreads;
4818 if (dpsoftrast.usethreads)
4821 DPSOFTRAST_Draw_SyncCommands();
4822 for (i = 0; i < dpsoftrast.numthreads; i++)
4824 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// only wake a sleeping thread if the draw's rows overlap one of its two bands
// NOTE(review): starving is read here without taking thread->drawmutex in the
// visible code; DPSOFTRAST_Draw_FlushThreads repeats the wake-up under the mutex -- confirm this is intended
4825 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4826 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch: execute the queued commands immediately
4831 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands for one thread state, starting at
// thread->commandoffset and stopping when the offset reaches endoffset.
// Offsets index dpsoftrast.commandpool.commands and wrap to 0 once they reach
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL. The thread's commandoffset is updated so
// other code can see how far this thread has progressed.
4835 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4837 int commandoffset = thread->commandoffset;
4838 while (commandoffset != endoffset)
4840 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4841 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name>, and wrap the offset at the end of the pool.
4843 #define INTERPCOMMAND(name) \
4844 case DPSOFTRAST_OPCODE_##name : \
4845 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4846 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4847 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4848 commandoffset = 0; \
4850 INTERPCOMMAND(Viewport)
4851 INTERPCOMMAND(ClearColor)
4852 INTERPCOMMAND(ClearDepth)
4853 INTERPCOMMAND(ColorMask)
4854 INTERPCOMMAND(DepthTest)
4855 INTERPCOMMAND(ScissorTest)
4856 INTERPCOMMAND(Scissor)
4857 INTERPCOMMAND(BlendFunc)
4858 INTERPCOMMAND(BlendSubtract)
4859 INTERPCOMMAND(DepthMask)
4860 INTERPCOMMAND(DepthFunc)
4861 INTERPCOMMAND(DepthRange)
4862 INTERPCOMMAND(PolygonOffset)
4863 INTERPCOMMAND(CullFace)
4864 INTERPCOMMAND(AlphaTest)
4865 INTERPCOMMAND(AlphaFunc)
4866 INTERPCOMMAND(SetTexture)
4867 INTERPCOMMAND(SetShader)
4868 INTERPCOMMAND(Uniform4f)
4869 INTERPCOMMAND(UniformMatrix4f)
4870 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the stored commandsize is used to advance
// instead of sizeof; progress is also written back to the thread after a draw
4872 case DPSOFTRAST_OPCODE_Draw:
4873 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4874 commandoffset += command->commandsize;
4875 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4877 thread->commandoffset = commandoffset;
// handling of the Reset opcode is not visible here
4880 case DPSOFTRAST_OPCODE_Reset:
// record the final position once endoffset has been reached
4885 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
// data is the thread's DPSOFTRAST_State_Thread. The loop keeps running while
// thread->index >= 0: it interprets commands whenever its offset is behind
// dpsoftrast.drawcommand and otherwise sleeps on drawcond.
4888 static int DPSOFTRAST_Draw_Thread(void *data)
4890 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4891 while(thread->index >= 0)
4893 if (thread->commandoffset != dpsoftrast.drawcommand)
4895 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4899 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so new commands or an exit request that arrived in the meantime are not slept through
4900 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// this thread has caught up: release a flusher that is blocked waiting for it
4902 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving tells the producer side that this thread needs a drawcond signal to continue
4903 thread->starving = true;
4904 Thread_CondWait(thread->drawcond, thread->drawmutex);
4905 thread->starving = false;
4907 Thread_UnlockMutex(thread->drawmutex);
// Makes sure every thread state has processed all queued commands up to
// dpsoftrast.drawcommand, then marks the command pool as empty.
// With worker threads: wake any sleeping thread that is behind, then block on
// each thread's waitcond until it has caught up. The last loop runs the
// commands on the calling thread instead.
4913 static void DPSOFTRAST_Draw_FlushThreads(void)
4915 DPSOFTRAST_State_Thread *thread;
4917 DPSOFTRAST_Draw_SyncCommands();
4918 if (dpsoftrast.usethreads)
// pass 1: kick every worker that still has commands to process and is asleep
4920 for (i = 0; i < dpsoftrast.numthreads; i++)
4922 thread = &dpsoftrast.threads[i];
// cheap unlocked test first, repeated under the mutex before signalling
4923 if (thread->commandoffset != dpsoftrast.drawcommand)
4925 Thread_LockMutex(thread->drawmutex);
4926 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4927 Thread_CondSignal(thread->drawcond);
4928 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker in turn; the worker signals waitcond when it finds itself caught up
4931 for (i = 0; i < dpsoftrast.numthreads; i++)
4933 thread = &dpsoftrast.threads[i];
4934 if (thread->commandoffset != dpsoftrast.drawcommand)
4936 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by a single `if` in the visible code, so a
// spurious wake-up would return before the worker is done -- confirm whether a loop is needed
4937 if (thread->commandoffset != dpsoftrast.drawcommand)
4939 thread->waiting = true;
4940 Thread_CondWait(thread->waitcond, thread->drawmutex);
4941 thread->waiting = false;
4943 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded branch: interpret the pending commands for each thread state right here
4949 for (i = 0; i < dpsoftrast.numthreads; i++)
4951 thread = &dpsoftrast.threads[i];
4952 if (thread->commandoffset != dpsoftrast.drawcommand)
4953 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool's usage counter starts over
4956 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper: processes all queued commands by calling DPSOFTRAST_Draw_FlushThreads.
4959 void DPSOFTRAST_Flush(void)
4961 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): the body is not visible here; presumably it waits for outstanding
// work in the same way as DPSOFTRAST_Flush -- confirm.
4964 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state for a width x height framebuffer and
// sets up the per-thread render states (plus the worker threads themselves when
// threading is requested and available).
//   width, height - framebuffer size in pixels
//   numthreads    - requested worker count; > 0 enables threading if Thread_HasThreads()
//   interlace     - non-zero gives each thread two row bands instead of one (threaded mode only)
//   colorpixels   - caller-provided color buffer, stored as fb_colorpixels[0]
//   depthpixels   - caller-provided depth buffer
// The return value is produced on a line that is not visible here.
4969 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; everything not assigned below stays 0 / NULL
4979 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4980 dpsoftrast.bigendian = u.b[3];
4981 dpsoftrast.fb_width = width;
4982 dpsoftrast.fb_height = height;
4983 dpsoftrast.fb_depthpixels = depthpixels;
4984 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; presumably [1], [2] and [3]
// were intended. Harmless as written because the memset above already cleared
// every slot -- confirm against the array's declared size before changing.
4985 dpsoftrast.fb_colorpixels[1] = NULL;
4986 dpsoftrast.fb_colorpixels[1] = NULL;
4987 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
4988 dpsoftrast.viewport[0] = 0;
4989 dpsoftrast.viewport[1] = 0;
4990 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4991 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4992 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no table allocated yet (texture_max == 0)
4993 dpsoftrast.texture_firstfree = 1;
4994 dpsoftrast.texture_end = 1;
4995 dpsoftrast.texture_max = 0;
4996 dpsoftrast.color[0] = 1;
4997 dpsoftrast.color[1] = 1;
4998 dpsoftrast.color[2] = 1;
4999 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support; without it a
// single thread state is used, interlacing is off
5000 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5001 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5002 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the result of this allocation is used below without a visible NULL check -- confirm
5003 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5004 for (i = 0; i < dpsoftrast.numthreads; i++)
5006 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5008 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not set in the visible lines and so stays 0
// from MM_CALLOC while [1..3] are enabled -- confirm whether that is intended
5009 thread->colormask[1] = 1;
5010 thread->colormask[2] = 1;
5011 thread->colormask[3] = 1;
5012 thread->blendfunc[0] = GL_ONE;
5013 thread->blendfunc[1] = GL_ZERO;
5014 thread->depthmask = true;
5015 thread->depthtest = true;
5016 thread->depthfunc = GL_LEQUAL;
5017 thread->scissortest = false;
5018 thread->alphatest = false;
5019 thread->alphafunc = GL_GREATER;
5020 thread->alphavalue = 0.5f;
5021 thread->viewport[0] = 0;
5022 thread->viewport[1] = 0;
5023 thread->viewport[2] = dpsoftrast.fb_width;
5024 thread->viewport[3] = dpsoftrast.fb_height;
5025 thread->scissor[0] = 0;
5026 thread->scissor[1] = 0;
5027 thread->scissor[2] = dpsoftrast.fb_width;
5028 thread->scissor[3] = dpsoftrast.fb_height;
5029 thread->depthrange[0] = 0;
5030 thread->depthrange[1] = 1;
5031 thread->polygonoffset[0] = 0;
5032 thread->polygonoffset[1] = 0;
// row bands owned by this thread. Interlaced: the framebuffer is cut into
// 2*numthreads equal slices and thread i gets slice i and slice numthreads+i.
5034 if (dpsoftrast.interlace)
5036 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5037 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5038 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5039 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// non-interlaced: one slice out of numthreads, stored in both band slots
5043 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5044 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5047 thread->numspans = 0;
5048 thread->numtriangles = 0;
5049 thread->commandoffset = 0;
5050 thread->waiting = false;
5051 thread->starving = false;
5053 thread->validate = -1;
5054 DPSOFTRAST_Validate(thread, -1);
// synchronization objects are created before the worker starts, since
// DPSOFTRAST_Draw_Thread uses drawmutex, drawcond and waitcond
5056 if (dpsoftrast.usethreads)
5058 thread->waitcond = Thread_CreateCond();
5059 thread->drawcond = Thread_CreateCond();
5060 thread->drawmutex = Thread_CreateMutex();
5061 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5067 void DPSOFTRAST_Shutdown(void)
5070 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5072 DPSOFTRAST_State_Thread *thread;
5073 for (i = 0; i < dpsoftrast.numthreads; i++)
5075 thread = &dpsoftrast.threads[i];
5076 Thread_LockMutex(thread->drawmutex);
5078 Thread_CondSignal(thread->drawcond);
5079 Thread_UnlockMutex(thread->drawmutex);
5080 Thread_WaitThread(thread->thread, 0);
5081 Thread_DestroyCond(thread->waitcond);
5082 Thread_DestroyCond(thread->drawcond);
5083 Thread_DestroyMutex(thread->drawmutex);
5086 for (i = 0;i < dpsoftrast.texture_end;i++)
5087 if (dpsoftrast.texture[i].bytes)
5088 MM_FREE(dpsoftrast.texture[i].bytes);
5089 if (dpsoftrast.texture)
5090 free(dpsoftrast.texture);
5091 if (dpsoftrast.threads)
5092 MM_FREE(dpsoftrast.threads);
5093 memset(&dpsoftrast, 0, sizeof(dpsoftrast));