3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
13 typedef qboolean bool;
17 #define ATOMIC_SIZE 32
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
26 #define ATOMIC_COUNTER volatile int
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
31 #elif defined(_MSC_VER)
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Zero-initialising allocator: obtains nmemb*size bytes from _mm_malloc with
// ATOMIC_SIZE alignment and clears them when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow here — confirm callers only pass small counts.
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if (ptr != NULL) memset(ptr, 0, nmemb*size);
79 #define MM_FREE _mm_free
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
102 typedef struct DPSOFTRAST_Texture_s
109 DPSOFTRAST_TEXTURE_FILTER filter;
112 ATOMIC_COUNTER binds;
113 unsigned char *bytes;
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields): declares the constant DPSOFTRAST_OPCODE_<name> = opcodeval
// and a COMMAND_ALIGN'ed struct typedef DPSOFTRAST_Command_<name> that starts with the
// opcode and commandsize members.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// SSE attribute setup for a span: loads attribs[arrayindex][0] into slope and sets
// data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// The loads use _mm_load_ps, so the attribs rows must be 16-byte aligned.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar attribute setup on float[4] operands: copies attribs[arrayindex][0] into slope and computes,
// per component, data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// NOTE(review): span is expanded as (span->x) / (span->y) rather than ((span)->x), so an
// expression argument for span would misparse; triangle is parenthesised correctly.
164 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
165 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
166 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
167 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
168 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
169 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
170 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
171 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
172 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int startx; // usable range (according to pixelmask)
183 int endx; // usable range (according to pixelmask)
184 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
186 DPSOFTRAST_State_Span);
188 #define DPSOFTRAST_DRAW_MAXSPANS 1024
189 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
191 #define DPSOFTRAST_VALIDATE_FB 1
192 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
193 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
194 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
196 typedef enum DPSOFTRAST_BLENDMODE_e
198 DPSOFTRAST_BLENDMODE_OPAQUE,
199 DPSOFTRAST_BLENDMODE_ALPHA,
200 DPSOFTRAST_BLENDMODE_ADDALPHA,
201 DPSOFTRAST_BLENDMODE_ADD,
202 DPSOFTRAST_BLENDMODE_INVMOD,
203 DPSOFTRAST_BLENDMODE_MUL,
204 DPSOFTRAST_BLENDMODE_MUL2,
205 DPSOFTRAST_BLENDMODE_SUBALPHA,
206 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
207 DPSOFTRAST_BLENDMODE_INVADD,
208 DPSOFTRAST_BLENDMODE_TOTAL
210 DPSOFTRAST_BLENDMODE;
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
231 float polygonoffset[2];
234 int shader_permutation;
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
262 ATOMIC(volatile int commandoffset);
264 volatile bool waiting;
265 volatile bool starving;
268 SDL_mutex *drawmutex;
272 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
273 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
275 DPSOFTRAST_State_Thread);
277 typedef ATOMIC(struct DPSOFTRAST_State_s
281 unsigned int *fb_depthpixels;
282 unsigned int *fb_colorpixels[4];
285 ALIGN(float fb_viewportcenter[4]);
286 ALIGN(float fb_viewportscale[4]);
289 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
290 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
292 const float *pointer_vertex3f;
293 const float *pointer_color4f;
294 const unsigned char *pointer_color4ub;
295 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305 float *screencoord4f;
311 int shader_permutation;
315 int texture_firstfree;
316 DPSOFTRAST_Texture *texture;
321 const char *errorstring;
325 DPSOFTRAST_State_Thread *threads;
327 ATOMIC(volatile int drawcommand);
329 DPSOFTRAST_State_Command_Pool commandpool;
333 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a float depth to the integer depth buffer: 1024 * 1048576 = 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit BGRA8 value (a in bits 24-31, r in 16-23,
// g in 8-15, b in 0-7), rounding each channel to nearest after scaling by 255.
// Every argument is parenthesised so expression arguments expand correctly, and the
// shifts are done on unsigned values so that a << 24 cannot overflow a signed int.
// The result is converted back to int to keep the macro's original type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) <<  8) | \
	((unsigned int)(int)((b) * 255.0f + 0.5f)      ) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth representation: DEPTHSCALE * (1 - d).
// d is parenthesised so that an expression such as x+y is subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derive the viewport transform from viewport = {x, y, width, height}.
// Both outputs are 4-float arrays: element [1] carries the x term, [2] the y term,
// [3] a constant 0.5, and [0] is 0 for the center and 1 for the scale.
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
343 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
// y is flipped against the framebuffer height (the y scale below is negative to match)
344 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345 fb_viewportcenter[3] = 0.5f;
346 fb_viewportcenter[0] = 0.0f;
347 fb_viewportscale[1] = 0.5f * viewport[2];
348 fb_viewportscale[2] = -0.5f * viewport[3];
349 fb_viewportscale[3] = 0.5f;
350 fb_viewportscale[0] = 1.0f;
// Recompute the per-thread derived framebuffer state: the clipped scissor rectangle
// (thread->fb_scissor) and the viewport center/scale vectors.
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
355 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356 // and viewport projection values
// scissor[] is {x, y, width, height}; the y range is flipped against the framebuffer height
359 x1 = thread->scissor[0];
360 x2 = thread->scissor[0] + thread->scissor[2];
361 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
363 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
365 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
367 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// fb_scissor is stored as {x, y, width, height}
368 thread->fb_scissor[0] = x1;
369 thread->fb_scissor[1] = y1;
370 thread->fb_scissor[2] = x2 - x1;
371 thread->fb_scissor[3] = y2 - y1;
373 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derive the effective depth comparison: the configured depthfunc while depth testing
// is enabled, GL_ALWAYS otherwise.
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
378 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Translate the blendsubtract flag and the (blendfunc[0], blendfunc[1]) factor pair into a
// DPSOFTRAST_BLENDMODE stored in thread->fb_blendmode.
// Factor pairs without a case below select DPSOFTRAST_BLENDMODE_OPAQUE.
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
383 if (thread->blendsubtract)
// both factors are packed into one switch key: source in the high 16 bits, destination in the low 16
385 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label built with the same packing
387 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
389 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
395 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
397 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// the next two factor pairs both select MUL
402 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
406 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
407 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in f is
// pending in thread->validate; always evaluates to 0.
// Both arguments are parenthesised so expression arguments expand correctly.
// Note that thread and f are each evaluated twice when validation runs.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Refresh the derived state selected by mask. Only flags that are both requested and
// currently set in thread->validate are processed; each handled flag is cleared
// before its Recalc function runs.
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
416 mask &= thread->validate;
419 if (mask & DPSOFTRAST_VALIDATE_FB)
421 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422 DPSOFTRAST_RecalcFB(thread);
424 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427 DPSOFTRAST_RecalcDepthFunc(thread);
429 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432 DPSOFTRAST_RecalcBlendFunc(thread);
// Look up a texture slot by index. A slot is returned only when index lies in
// [1, dpsoftrast.texture_end) and the slot has pixel storage (bytes is non-NULL).
// Callers test the result with !texture, so the failure result is presumably NULL — TODO confirm.
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
438 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439 return &dpsoftrast.texture[index];
// Enlarge the global texture array and rebase every bound-texture pointer (the global
// texbound table and each thread's texbound table) onto the new allocation.
443 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound pointers can be converted to offsets after the array moves
445 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446 DPSOFTRAST_State_Thread *thread;
450 // expand texture array as needed
451 if (dpsoftrast.texture_max < 1024)
452 dpsoftrast.texture_max = 1024;
454 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL check;
// on failure the old array pointer is lost and the rebasing below would start from NULL.
455 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (dpsoftrast.texbound[i])
458 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
459 for (j = 0; j < dpsoftrast.numthreads; j++)
461 thread = &dpsoftrast.threads[j];
462 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463 if (thread->texbound[i])
464 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Create a texture of width x height x depth texels with the given flags.
// The request is validated first (each rejection stores a message in dpsoftrast.errorstring),
// then a free slot in dpsoftrast.texture is claimed, the per-level mipmap table is filled in
// and zeroed pixel storage is allocated.
// Presumably returns the slot index of the new texture — TODO confirm against the return statements.
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
477 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is evaluated in int before the MAXSIZE check below, so very large
// inputs can overflow, and two negative dimensions multiply to a positive value — confirm intent.
480 if (width*height*depth < 1)
482 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
485 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
492 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
499 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as the cubemap rejection above — confirm it matches the condition guarding it.
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
514 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
516 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the preceding check verbatim.
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when a positive x is not a power of two
534 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
539 // find first empty slot in texture array
540 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
543 dpsoftrast.texture_firstfree = texnum + 1;
544 if (dpsoftrast.texture_max <= texnum)
545 DPSOFTRAST_Texture_Grow();
546 if (dpsoftrast.texture_end <= texnum)
547 dpsoftrast.texture_end = texnum + 1;
548 texture = &dpsoftrast.texture[texnum];
549 memset(texture, 0, sizeof(*texture));
550 texture->flags = flags;
551 texture->width = width;
552 texture->height = height;
553 texture->depth = depth;
554 texture->sides = sides;
// per-level table entry: [0] byte offset into texture->bytes, [1] byte size of the level
// (4 bytes per texel, times sides), [2] width, [3] height, [4] depth
566 s = w * h * d * sides * 4;
567 texture->mipmap[mipmaps][0] = size;
568 texture->mipmap[mipmaps][1] = s;
569 texture->mipmap[mipmaps][2] = w;
570 texture->mipmap[mipmaps][3] = h;
571 texture->mipmap[mipmaps][4] = d;
// true once the level is a single texel or when mipmaps were not requested
574 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 texture->mipmaps = mipmaps;
581 texture->size = size;
583 // allocate the pixels now
584 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release the texture at index: frees its pixel storage, clears the slot and updates
// the free/used bookkeeping. Does nothing when index does not name a live texture.
588 void DPSOFTRAST_Texture_Free(int index)
590 DPSOFTRAST_Texture *texture;
591 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595 MM_FREE(texture->bytes);
596 texture->bytes = NULL;
// zeroing the whole slot also leaves bytes NULL, which is what marks a slot as empty
597 memset(texture, 0, sizeof(*texture));
598 // adjust the free range and used range
599 if (dpsoftrast.texture_firstfree > index)
600 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing empty slots
601 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602 dpsoftrast.texture_end--;
// Rebuild every mip level above 0 of the texture at index by averaging 4-byte texels of
// the level below it. Levels start at 1, so a texture with a single level has nothing to rebuild.
604 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
606 int i, x, y, z, w, layer0, layer1, row0, row1;
// o is the output texel; i0..i3 are the source rows (row0/row1 in layer0/layer1) of the parent level
607 unsigned char *o, *i0, *i1, *i2, *i3;
608 DPSOFTRAST_Texture *texture;
609 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
610 if (texture->mipmaps <= 1)
612 for (i = 1;i < texture->mipmaps;i++)
614 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
618 if (layer1 >= texture->mipmap[i-1][4])
619 layer1 = texture->mipmap[i-1][4]-1;
620 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
624 if (row1 >= texture->mipmap[i-1][3])
625 row1 = texture->mipmap[i-1][3]-1;
// mipmap[level][0] is the level's byte offset, [2] its width and [3] its height; texels are 4 bytes
626 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
627 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
628 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
629 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
630 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
631 w = texture->mipmap[i][2];
634 if (texture->mipmap[i-1][2] > 1)
636 // average 3D texture
// eight source texels per output texel, rounded: (sum + 4) >> 3
637 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
639 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
640 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
641 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
642 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
647 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 advance in this loop while i2 and i3 do not; harmless if w is
// always 1 when the parent width is 1 — confirm.
648 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
650 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
651 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
652 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
653 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
659 if (texture->mipmap[i-1][2] > 1)
661 // average 2D texture (common case)
// four source texels per output texel, rounded: (sum + 2) >> 2
662 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
664 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
665 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
666 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
667 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
672 // 2D texture with parent width == 1
673 o[0] = (i0[0] + i1[0] + 1) >> 1;
674 o[1] = (i0[1] + i1[1] + 1) >> 1;
675 o[2] = (i0[2] + i1[2] + 1) >> 1;
676 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight rectangle of 4-byte texels from pixels into the texture
// at (blockx, blocky), then rebuild the mipmaps.
// NOTE(review): the destination is always computed from mipmap[0] (the level-0 width);
// the mip parameter is not used in these lines — confirm that only level 0 is meant to be updatable.
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
685 DPSOFTRAST_Texture *texture;
687 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
690 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
691 while (blockheight > 0)
// source rows are tightly packed; destination rows are one full texture row apart
693 memcpy(dst, pixels, blockwidth * 4);
694 pixels += blockwidth * 4;
695 dst += texture->mipmap[0][2] * 4;
698 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace the whole base level of the texture at index with pixels, then rebuild the mipmaps.
// pixels must supply mipmap[0][1] bytes, the byte size recorded for level 0.
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
702 DPSOFTRAST_Texture *texture;
703 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
706 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707 DPSOFTRAST_Texture_CalculateMipmaps(index);
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][2];
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][3];
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's pixel storage,
// or 0 when index does not name a live texture.
// NOTE(review): mip is used to index texture->mipmap without a range check — confirm callers pass a valid level.
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
729 DPSOFTRAST_Texture *texture;
730 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset into texture->bytes
733 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of the texture at index.
// Filter values greater than DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected with an error string
// when the texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP; the error path
// presumably leaves the filter unchanged — TODO confirm.
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
741 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
746 texture->filter = filter;
// Select the framebuffer to render into: its size, the depth buffer and up to four color buffers.
// The stored state is only rewritten when at least one of the values differs from the current one.
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
751 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
755 dpsoftrast.fb_width = width;
756 dpsoftrast.fb_height = height;
757 dpsoftrast.fb_depthpixels = depthpixels;
758 dpsoftrast.fb_colorpixels[0] = colorpixels0;
759 dpsoftrast.fb_colorpixels[1] = colorpixels1;
760 dpsoftrast.fb_colorpixels[2] = colorpixels2;
761 dpsoftrast.fb_colorpixels[3] = colorpixels3;
764 static void DPSOFTRAST_Draw_FlushThreads(void);
// Copies the command pool's current allocation position (freecommand) into
// dpsoftrast.drawcommand, the value thread command offsets are compared against.
766 static void DPSOFTRAST_Draw_SyncCommands(void)
769 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make room for space more bytes in the command pool: recompute how much of the pool
// is still in use from the threads' command offsets and, when that is still too much,
// wait on a thread that has not caught up yet.
772 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
775 DPSOFTRAST_State_Thread *thread;
777 int freecommand = dpsoftrast.commandpool.freecommand;
778 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already
779 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
781 DPSOFTRAST_Draw_SyncCommands();
787 for (i = 0; i < dpsoftrast.numthreads; i++)
789 thread = &dpsoftrast.threads[i];
// distance between the allocation position and this thread's position, wrapped to the pool size
790 commandoffset = freecommand - thread->commandoffset;
791 if (commandoffset < 0)
792 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance seen
793 if (commandoffset > usedcommands)
796 usedcommands = commandoffset;
799 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
801 thread = &dpsoftrast.threads[waitindex];
802 SDL_LockMutex(thread->drawmutex);
// only block while the thread still has commands left to reach drawcommand
803 if (thread->commandoffset != dpsoftrast.drawcommand)
805 thread->waiting = true;
// signal drawcond if the thread has flagged itself as starving
806 if (thread->starving) SDL_CondSignal(thread->drawcond);
807 SDL_CondWait(thread->waitcond, thread->drawmutex);
808 thread->waiting = false;
810 SDL_UnlockMutex(thread->drawmutex);
812 dpsoftrast.commandpool.usedcommands = usedcommands;
814 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_ALIGNCOMMAND(size): adds the padding needed to bring size up to the next multiple of
// COMMAND_SIZE; the masking assumes COMMAND_SIZE is a power of two — TODO confirm.
// DPSOFTRAST_ALLOCATECOMMAND(name): allocates a command with opcode DPSOFTRAST_OPCODE_<name> and the
// aligned size of DPSOFTRAST_Command_<name>, returning it as a pointer to that struct type.
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve size bytes in the command pool for a command with the given opcode and fill in
// its opcode and commandsize header fields.
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
825 DPSOFTRAST_Command *command;
826 int freecommand = dpsoftrast.commandpool.freecommand;
827 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one bare command header in addition to the request
828 int extra = sizeof(DPSOFTRAST_Command);
// if the request does not fit before the end of the pool, the remaining tail is needed as well
829 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
831 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
// re-read the positions after DPSOFTRAST_Draw_FreeCommandPool
834 freecommand = dpsoftrast.commandpool.freecommand;
835 usedcommands = dpsoftrast.commandpool.usedcommands;
// not enough room before the end of the pool: mark the tail with a Reset opcode and count it as used
837 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
839 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840 command->opcode = DPSOFTRAST_OPCODE_Reset;
841 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
844 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845 command->opcode = opcode;
// commandsize is an unsigned short, so size must fit in 16 bits
846 command->commandsize = size;
848 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
850 dpsoftrast.commandpool.freecommand = freecommand;
851 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Take back size bytes of command pool allocation: adjusts freecommand (with a wrap by the
// pool size) and subtracts size from usedcommands, then stores both back.
855 static void DPSOFTRAST_UndoCommand(int size)
857 int freecommand = dpsoftrast.commandpool.freecommand;
858 int usedcommands = dpsoftrast.commandpool.usedcommands;
// presumably applied only when stepping back moved freecommand below zero — TODO confirm
861 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862 usedcommands -= size;
863 dpsoftrast.commandpool.freecommand = freecommand;
864 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1) and its handler: copies the rectangle into
// thread->viewport as {x, y, width, height} and marks the framebuffer-derived state stale.
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
870 thread->viewport[0] = command->x;
871 thread->viewport[1] = command->y;
872 thread->viewport[2] = command->width;
873 thread->viewport[3] = command->height;
874 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Viewport command in the command pool, and also updates the
// global copy of the viewport together with its derived center/scale vectors.
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
878 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881 command->width = width;
882 command->height = height;
884 dpsoftrast.viewport[0] = x;
885 dpsoftrast.viewport[1] = y;
886 dpsoftrast.viewport[2] = width;
887 dpsoftrast.viewport[3] = height;
888 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2) and its handler: fills the scissor rectangle of every attached
// color buffer with the packed BGRA8 clear color, restricted to the rows in this thread's
// miny1..maxy1 and miny2..maxy2 ranges.
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
894 int i, x1, y1, x2, y2, w, h, x, y;
895 int miny1 = thread->miny1;
896 int maxy1 = thread->maxy1;
897 int miny2 = thread->miny2;
898 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read below
902 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903 x1 = thread->fb_scissor[0];
904 y1 = thread->fb_scissor[1];
905 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
906 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows covered by this thread
907 if (y1 < miny1) y1 = miny1;
908 if (y2 > maxy2) y2 = maxy2;
913 // FIXME: honor fb_colormask?
914 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915 for (i = 0;i < 4;i++)
// color buffers that are not attached are NULL
917 if (!dpsoftrast.fb_colorpixels[i])
// first pass ends at maxy1, the following pass resumes at miny2 and ends at maxy2
919 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearColor command in the command pool.
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
930 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3) and its handler: fills the scissor rectangle of the depth buffer
// with the integer form of command->depth, restricted to the rows in this thread's
// miny1..maxy1 and miny2..maxy2 ranges.
937 DEFCOMMAND(3, ClearDepth, float depth;)
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
940 int x1, y1, x2, y2, w, h, x, y;
941 int miny1 = thread->miny1;
942 int maxy1 = thread->maxy1;
943 int miny2 = thread->miny2;
944 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read below
948 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949 x1 = thread->fb_scissor[0];
950 y1 = thread->fb_scissor[1];
951 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows covered by this thread
953 if (y1 < miny1) y1 = miny1;
954 if (y2 > maxy2) y2 = maxy2;
959 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass ends at maxy1, the following pass resumes at miny2 and ends at maxy2
960 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964 for (x = x1;x < x2;x++)
// Public entry point: allocates a ClearDepth command in the command pool.
968 void DPSOFTRAST_ClearDepth(float d)
970 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4) and its handler: stores each channel flag as 0 or 1 and builds
// the matching 32-bit pixel mask in thread->fb_colormask.
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
977 thread->colormask[0] = command->r != 0;
978 thread->colormask[1] = command->g != 0;
979 thread->colormask[2] = command->b != 0;
980 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all ones, which selects the channel's byte:
// red bits 16-23, green bits 8-15, blue bits 0-7, alpha bits 24-31
981 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: allocates a ColorMask command in the command pool.
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
985 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5) and its handler: stores the enable flag and marks the derived
// depth function stale so it is recomputed on the next validation.
992 DEFCOMMAND(5, DepthTest, int enable;)
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
995 thread->depthtest = command->enable;
996 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: allocates a DepthTest command in the command pool and stores enable in it.
998 void DPSOFTRAST_DepthTest(int enable)
1000 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001 command->enable = enable;
// ScissorTest command (opcode 6) and its handler: stores the enable flag and marks the
// framebuffer-derived state (which includes the clipped scissor) stale.
1004 DEFCOMMAND(6, ScissorTest, int enable;)
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1007 thread->scissortest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a ScissorTest command in the command pool and stores enable in it.
1010 void DPSOFTRAST_ScissorTest(int enable)
1012 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013 command->enable = enable;
// Scissor command (opcode 7) and its handler: copies the rectangle into thread->scissor as
// {x, y, width, height} and marks the framebuffer-derived state stale.
// The command carries the rectangle as floats.
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1019 thread->scissor[0] = command->x;
1020 thread->scissor[1] = command->y;
1021 thread->scissor[2] = command->width;
1022 thread->scissor[3] = command->height;
1023 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: allocates a Scissor command in the command pool for the given rectangle.
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1027 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030 command->width = width;
1031 command->height = height;
// BlendFunc command (opcode 8) and its handler: stores the source and destination factors and
// marks the derived blend mode stale.
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 thread->blendfunc[0] = command->sfactor;
1038 thread->blendfunc[1] = command->dfactor;
1039 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command in the command pool and stores both factors in it.
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044 command->sfactor = sfactor;
1045 command->dfactor = dfactor;
// BlendSubtract command (opcode 9) and its handler: stores the enable flag and marks the
// derived blend mode stale.
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1051 thread->blendsubtract = command->enable;
1052 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command in the command pool and stores enable in it.
1054 void DPSOFTRAST_BlendSubtract(int enable)
1056 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057 command->enable = enable;
// DepthMask command (opcode 10) and its handler: stores the enable flag in thread->depthmask.
1060 DEFCOMMAND(10, DepthMask, int enable;)
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1063 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command in the command pool and stores enable in it.
1065 void DPSOFTRAST_DepthMask(int enable)
1067 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068 command->enable = enable;
1071 DEFCOMMAND(11, DepthFunc, int func;)
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1074 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command in the command pool and stores func in it.
1076 void DPSOFTRAST_DepthFunc(int func)
1078 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079 command->func = func;
// DepthRange command (opcode 12) and its handler: stores the near and far values in
// thread->depthrange[0] and [1].
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1085 thread->depthrange[0] = command->nearval;
1086 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command in the command pool and stores both values in it.
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1090 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091 command->nearval = nearval;
1092 command->farval = farval;
// PolygonOffset command (opcode 13) and its handler: stores alongnormal and intoview in
// thread->polygonoffset[0] and [1].
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1098 thread->polygonoffset[0] = command->alongnormal;
1099 thread->polygonoffset[1] = command->intoview;
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1103 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104 command->alongnormal = alongnormal;
1105 command->intoview = intoview;
// CullFace command (id 14); payload is the cull mode value.
1108 DEFCOMMAND(14, CullFace, int mode;)
// Copies the command's mode into the thread's cullface state.
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1111 thread->cullface = command->mode;
// Allocates a CullFace command carrying `mode`.
1113 void DPSOFTRAST_CullFace(int mode)
1115 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116 command->mode = mode;
// AlphaTest command (id 15); payload is a single enable flag.
1119 DEFCOMMAND(15, AlphaTest, int enable;)
// Copies the command's enable flag into the thread's alphatest state.
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 thread->alphatest = command->enable;
// Allocates an AlphaTest command carrying `enable`.
1124 void DPSOFTRAST_AlphaTest(int enable)
1126 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127 command->enable = enable;
// AlphaFunc command (id 16); payload is the comparison function and the
// reference value.
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Stores the command's function in thread->alphafunc and its reference
// value in thread->alphavalue.
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1133 thread->alphafunc = command->func;
1134 thread->alphavalue = command->ref;
// Allocates an AlphaFunc command for the given function and reference value.
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1138 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139 command->func = func;
// Sets the current constant color (r, g, b, a) directly in the global
// dpsoftrast state; no command is allocated here.
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1145 dpsoftrast.color[0] = r;
1146 dpsoftrast.color[1] = g;
1147 dpsoftrast.color[2] = b;
1148 dpsoftrast.color[3] = a;
// Reads a rectangular block of the first color buffer into outpixels.
// The requested rectangle is clamped to the framebuffer, and output rows
// are blockwidth*4 bytes apart regardless of clamping.
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1153 int outstride = blockwidth * 4;
1154 int instride = dpsoftrast.fb_width * 4;
1157 int bx2 = blockx + blockwidth;
1158 int by2 = blocky + blockheight;
1163 unsigned char *inpixels;
// clamp the block to the framebuffer bounds
1167 if (bx1 < 0) bx1 = 0;
1168 if (by1 < 0) by1 = 0;
1169 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// separate row loops for big-endian and other hosts
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
// source rows are addressed from the bottom of the buffer (fb_height - 1 - y)
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the first color buffer, starting
// at (sx, sy), into mip level `mip` of texture `index` at (tx, ty).
// Does nothing if the texture handle or mip level is invalid. Both
// rectangles are clamped to their surfaces before copying.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
1228 spixels = dpsoftrast.fb_colorpixels[0];
1229 swidth = dpsoftrast.fb_width;
1230 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3]
// as the level's width and height
1231 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232 twidth = texture->mipmap[mip][2];
1233 theight = texture->mipmap[mip][3];
// clamp the target rectangle to the mip level and the source rectangle to
// the framebuffer
1234 if (tx1 < 0) tx1 = 0;
1235 if (ty1 < 0) ty1 = 0;
1236 if (tx2 > twidth) tx2 = twidth;
1237 if (ty2 > theight) ty2 = theight;
1238 if (sx1 < 0) sx1 = 0;
1239 if (sy1 < 0) sy1 = 0;
1240 if (sx2 > swidth) sx2 = swidth;
1241 if (sy2 > sheight) sy2 = sheight;
// the copied extent is limited to the smaller of the two rectangles
1246 if (tw > sw) tw = sw;
1247 if (th > sh) th = sh;
1248 if (tw < 1 || th < 1)
// copy row by row, tw pixels of 4 bytes each
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// rebuild the mip levels when the texture has more than one
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command (id 17); payload is the texture unit number and the
// texture pointer to bind (may be NULL).
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Rebinds a texture unit in the thread state. If the unit already has a
// texture bound, that texture's bind counter is atomically decremented
// before it is replaced.
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Records a float color array and its stride; clears any unsigned-byte
// color array so only one of the two pointers is set.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Records an unsigned-byte color array and its stride; clears any float
// color array so only one of the two pointers is set.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Records the texcoord array pointer, component count and stride for the
// given unit.
// NOTE(review): unitnum is used as an array index without a range check
// here - confirm callers guarantee it is valid.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command (id 18); payload is the shader mode and permutation.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Copies the command's shader mode and permutation into the thread state.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Allocates a SetShader command and also mirrors the mode and permutation
// into the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// Uniform4f command (id 19); payload is the uniform index and four floats.
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Copies the command's four floats into the thread's uniform4f table at
// slot index*4.
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-float uniform from individual components: allocates a
// Uniform4f command and mirrors the values into the global uniform4f table.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Same as DPSOFTRAST_Uniform4f but reads the four floats from the array v.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command (id 20); payload is the uniform index and a
// 16-float matrix declared through ALIGN().
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Copies the command's 16 floats into the thread's uniform4f table
// starting at slot index*4.
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets `arraysize` consecutive 4x4 matrix uniforms from v (16 floats each).
// Each matrix advances the uniform index by 4 and gets its own command;
// the values are also mirrored into the global uniform4f table.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = index;
// pick unaligned or aligned loads depending on the alignment of v
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack/move shuffles
// (presumably guarded by the `transpose` argument - TODO confirm)
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload and the global uniform table
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command (id 21); payload is the uniform index and one integer.
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Stores the command's integer in the thread's uniform1i table.
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command and mirrors the
// value into the global uniform1i table.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided byte source into the
// destination array dst. dst is written with aligned stores; the source
// is read with unaligned loads when either the source address or the
// stride is not a multiple of ALIGN_SIZE.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into 4-float
// elements in dst, setting the fourth component of each to 1.0.
// When the source is tightly packed (stride == sizeof(float[3])) four
// elements (12 floats) are converted per step from three vector loads;
// the generic strided path converts one element per load.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
// end of the portion that can be processed four elements at a time
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1,v2,v3 hold x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// each element is rotated so its spare lane is lane 0, that lane is set to
// 1.0 with _mm_move_ss, and the vector is rotated back to x y z 1
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// aligned packed path: same shuffles with aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// generic path: one element per load; the fourth loaded float is replaced
// by 1.0
// NOTE(review): each load reads 16 bytes from a 12-byte element - confirm
// the source buffer is readable one float past the last element.
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into 4-float
// elements in dst, filling the third and fourth components with 0 and 1.
// Tightly packed input (stride == sizeof(float[2])) is converted two
// elements per vector load; otherwise one element is loaded at a time.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
// constant supplying the (0, 1) tail of every output element
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
// end of the portion that can be processed two elements at a time
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1533 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low two floats of v plus the 0,1 tail
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second element: high two floats of v plus the 0,1 tail
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
// single element: load two floats into the low half of v2
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided byte source into 4-float
// elements in dst, scaling each byte by 1/255.
// Tightly packed input (stride == 4) is converted four elements per
// 16-byte load; otherwise one element is converted at a time.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
// end of the portion that can be processed four elements at a time
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes to 16-bit (v1 = low 8 bytes, v2 = high 8 bytes), then to
// 32-bit, convert to float and scale
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single element: read the four bytes as one 32-bit integer
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` 4-float elements of dst with the single 4-float value read
// from src. src is loaded unaligned; dst is written with aligned stores.
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Transforms numitems 4-float vertices from in4f by the 16-float matrix
// inmatrix16f. For each vertex (x, y, z, w) the visible expression is
// x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the four consecutive
// 4-float groups of the matrix.
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
// a bitwise-identical identity matrix degenerates to a plain copy (or
// nothing at all when transforming in place)
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the loop: unaligned and aligned input loads
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vertices from in4f to out4f with memcpy, so the
// two ranges must not overlap.
1663 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1665 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-projects one vertex: the input is rotated to (w, x, y, z)
// lane order, lane 0 is replaced by 1.0, the vector is multiplied by
// viewportscale, divided by the original w, offset by viewportcenter, and
// rotated back. The viewport vectors are therefore consumed in the rotated
// lane order.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// NOTE(review): the visible body is identical to DPSOFTRAST_PROJECTVERTEX.
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by the matrix given as four vectors:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Derives a screen-space row range (*starty, *endy) from the eight corners
// of the bounding box spanned by minpos/maxpos, transformed by m0..m3.
// clipmask starts with all eight corner bits set; a bit is cleared for each
// corner whose clip distance (z + w after transformation) is >= 0.
// Corners that stay flagged contribute through the edges to their
// unflagged neighbours instead (BBCLIP).
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
// minproj/maxproj start outside the clamp range [-2, 2] applied below
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix vector so the transformed Y lands in
// lane 0, where the scalar (_ss) operations below work
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k; if it is not behind the z + w = 0
// plane, clear its clipmask bit and fold its lane-0 value divided by w
// into minproj/maxproj
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
// corners are built by mixing components of minpos and maxpos
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a corner still flagged in clipmask, interpolate along each
// of its three box edges (neighbours k^1, k^2, k^4) to the point where the
// clip distance reaches zero, and fold that point into minproj/maxproj
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring the viewport component used for rows into lane 0
// (presumably the Y center/scale - TODO confirm the lane layout)
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to screen units
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// note the start row comes from maxproj and the end row from minproj
// (truncated, end made exclusive with +1)
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems 4-float vertices to screen space without applying a
// matrix: each input vertex is copied to out4f, its projected form is
// written to screen4f, and the component-wise min/max of the inputs is
// tracked. Returns the result of DPSOFTRAST_Vertex_BoundY for that
// bounding box with an identity matrix, which also fills *starty/*endy.
1766 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1769 float *end = out4f + numitems*4;
1770 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1771 __m128 minpos, maxpos;
// two copies of the loop: unaligned and aligned input loads
1772 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1774 minpos = maxpos = _mm_loadu_ps(in4f);
1777 __m128 v = _mm_loadu_ps(in4f);
1778 minpos = _mm_min_ps(minpos, v);
1779 maxpos = _mm_max_ps(maxpos, v);
1780 _mm_store_ps(out4f, v);
1781 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1782 _mm_store_ps(screen4f, v);
1790 minpos = maxpos = _mm_load_ps(in4f);
1793 __m128 v = _mm_load_ps(in4f);
1794 minpos = _mm_min_ps(minpos, v);
1795 maxpos = _mm_max_ps(maxpos, v);
1796 _mm_store_ps(out4f, v);
1797 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1798 _mm_store_ps(screen4f, v);
1805 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1806 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1808 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1809 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems 4-float vertices by inmatrix16f and projects them to
// screen space: the transformed vertex goes to out4f and its projected
// form to screen4f. The bounding box is accumulated from the untransformed
// inputs and handed to DPSOFTRAST_Vertex_BoundY together with the matrix;
// that call's result is returned and fills *starty/*endy.
1814 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1817 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1818 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// a bitwise-identical identity matrix takes the projection-only path
1820 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1821 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1822 end = out4f + numitems*4;
1823 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1824 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1825 m0 = _mm_loadu_ps(inmatrix16f);
1826 m1 = _mm_loadu_ps(inmatrix16f + 4);
1827 m2 = _mm_loadu_ps(inmatrix16f + 8);
1828 m3 = _mm_loadu_ps(inmatrix16f + 12);
// two copies of the loop: unaligned and aligned input loads
1829 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1831 minpos = maxpos = _mm_loadu_ps(in4f);
1834 __m128 v = _mm_loadu_ps(in4f);
1835 minpos = _mm_min_ps(minpos, v);
1836 maxpos = _mm_max_ps(maxpos, v);
1837 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1838 _mm_store_ps(out4f, v);
1839 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1840 _mm_store_ps(screen4f, v);
1848 minpos = maxpos = _mm_load_ps(in4f);
1851 __m128 v = _mm_load_ps(in4f);
1852 minpos = _mm_min_ps(minpos, v);
1853 maxpos = _mm_max_ps(maxpos, v);
1854 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1855 _mm_store_ps(out4f, v);
1856 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1857 _mm_store_ps(screen4f, v);
1864 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads the user-supplied vertex attribute `inarray` for the current draw
// range (dpsoftrast.firstvertex, dpsoftrast.numvertices) into the 4-float
// staging array dpsoftrast.post_array4f[outarray], converting from the
// source format as needed.
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1871 float *outf = dpsoftrast.post_array4f[outarray];
1872 const unsigned char *inb;
1873 int firstvertex = dpsoftrast.firstvertex;
1874 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex expanded to 4
1878 case DPSOFTRAST_ARRAY_POSITION:
1879 stride = dpsoftrast.stride_vertex;
1880 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else unsigned-byte array if set, else the
// constant color replicated for every vertex
1883 case DPSOFTRAST_ARRAY_COLOR:
1884 stride = dpsoftrast.stride_color;
1885 if (dpsoftrast.pointer_color4f)
1887 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1890 else if (dpsoftrast.pointer_color4ub)
1892 stride = dpsoftrast.stride_color;
1893 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1898 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and the
// loader is chosen by the unit's component count
1902 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1905 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1906 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1909 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1915 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms staging array `outarray` in place by inmatrix16f. When
// inarray >= 0 the array is first loaded from that input attribute;
// otherwise the existing contents of post_array4f[outarray] are used.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects staging array `outarray` to screen space (optionally loading it
// from `inarray` first). Screen coordinates go to dpsoftrast.screencoord4f,
// and drawclipped, drawstarty and drawendy are updated from the result.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1934 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Same as DPSOFTRAST_Array_Project, but the vertices are transformed in
// place by inmatrix16f before projection.
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1942 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for a span. w is a linear function of
// screen position built from triangle->w[0] (per-x slope), triangle->w[1]
// (per-y slope) and triangle->w[2] (constant). The reciprocal is computed
// exactly only at sub-span boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels) and linearly interpolated in between.
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1950 int startx = span->startx;
1951 int endx = span->endx;
1952 float wslope = triangle->w[0];
1953 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel of the span
1954 float endz = 1.0f / (w + wslope * startx);
// x is advanced by the inner loop, one sub-span per outer iteration
1955 for (x = startx;x < endx;)
1957 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel of the span
1959 if (nextsub >= endx) nextsub = endsub = endx-1;
1960 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel
1961 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962 for (; x <= endsub; x++, z += dz)
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1970 int startx = span->startx;
1971 int endx = span->endx;
1974 unsigned char * RESTRICT pixelmask = span->pixelmask;
1975 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1978 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979 // handle alphatest now (this affects depth writes too)
1980 if (thread->alphatest)
1981 for (x = startx;x < endx;x++)
1982 if (in4f[x*4+3] < 0.5f)
1983 pixelmask[x] = false;
1984 // FIXME: this does not handle bigendian
1985 switch(thread->fb_blendmode)
1987 case DPSOFTRAST_BLENDMODE_OPAQUE:
1988 for (x = startx;x < endx;x++)
1992 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
2002 case DPSOFTRAST_BLENDMODE_ALPHA:
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
2019 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
2035 case DPSOFTRAST_BLENDMODE_ADD:
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
2050 case DPSOFTRAST_BLENDMODE_INVMOD:
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
2065 case DPSOFTRAST_BLENDMODE_MUL:
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
2080 case DPSOFTRAST_BLENDMODE_MUL2:
2081 for (x = startx;x < endx;x++)
2085 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089 pixel[x*4+0] = d[0];
2090 pixel[x*4+1] = d[1];
2091 pixel[x*4+2] = d[2];
2092 pixel[x*4+3] = d[3];
2095 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096 for (x = startx;x < endx;x++)
2100 a = in4f[x*4+3] * -255.0f;
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
2111 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112 for (x = startx;x < endx;x++)
2117 b = 1.0f - in4f[x*4+3];
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
2128 case DPSOFTRAST_BLENDMODE_INVADD:
2129 for (x = startx;x < endx;x++)
2133 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137 pixel[x*4+0] = d[0];
2138 pixel[x*4+1] = d[1];
2139 pixel[x*4+2] = d[2];
2140 pixel[x*4+3] = d[3];
// Final stage for a span of packed 4-byte pixels: combines in4ub (indexed by span x)
// with color buffer 0 of the framebuffer according to thread->fb_blendmode.
// span->pixelmask (one byte per pixel) gates which pixels are processed; the alpha
// test below may clear further mask bytes before any pixel is written.
// The blend paths widen each channel from 8 to 16 bits, do the arithmetic with SSE2,
// and pack back with unsigned saturation (_mm_packus_epi16).
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2150 int startx = span->startx;
2151 int endx = span->endx;
// ini: the incoming colors viewed as one 32-bit word per pixel
2152 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153 unsigned char * RESTRICT pixelmask = span->pixelmask;
2154 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to pixel (span->x, span->y); from here on they are indexed by span x
2158 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159 pixeli += span->y * dpsoftrast.fb_width + span->x;
2160 // handle alphatest now (this affects depth writes too)
2161 if (thread->alphatest)
2162 for (x = startx;x < endx;x++)
// NOTE(review): in4ub holds unsigned char values, so "< 0.5f" is true only when the
// alpha byte is 0 - confirm whether a mid-range threshold (e.g. 128) was intended
2163 if (in4ub[x*4+3] < 0.5f)
2164 pixelmask[x] = false;
2165 // FIXME: this does not handle bigendian
2166 switch(thread->fb_blendmode)
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
// fast path: when four consecutive mask bytes are all 1, four pixels are copied
// with a single unaligned 128-bit load/store
2169 for (x = startx;x + 4 <= endx;)
2171 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2173 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187 case DPSOFTRAST_BLENDMODE_ALPHA:
// ALPHA: dst += ((src - dst) * alpha) >> 8, alpha being channel 3 of src broadcast
// to all channels (both operands are pre-shifted by 4 so the signed high-half
// multiply yields the >> 8)
//
// FINISHBLEND(blend2, blend1): walks the span two pixels at a time, switching on a
// pair of mask bytes read as one 16-bit value, then handles what is left one pixel
// at a time. src and dst are the incoming and framebuffer pixels widened to 16 bits
// per channel; the result is packed back into pixeli.
// NOTE(review): the lines that expand blend2/blend1 are not visible here; presumably
// blend2 is the two-pixel body and blend1 the one-pixel body - confirm.
2188 #define FINISHBLEND(blend2, blend1) \
2189 for (x = startx;x + 1 < endx;x += 2) \
2192 switch (*(const unsigned short*)&pixelmask[x]) \
2195 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2198 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2201 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2204 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2207 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2210 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 for(;x < endx; x++) \
2218 if (!pixelmask[x]) \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
// dst += (src * alpha) >> 8
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2243 case DPSOFTRAST_BLENDMODE_ADD:
// dst = src + dst (the pack step saturates at 255)
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
// dst -= (dst * src) >> 8
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2253 case DPSOFTRAST_BLENDMODE_MUL:
// dst = (src * dst) >> 8
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2256 case DPSOFTRAST_BLENDMODE_MUL2:
// dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// dst -= (src * alpha) >> 8 (the pack step saturates negative results to 0)
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
// dst = src + dst - ((dst * alpha) >> 8)
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2277 case DPSOFTRAST_BLENDMODE_INVADD:
// dst += ((255 - dst) * src) >> 8
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex for every pixel of the
// span and writes the result to out4f as 4 floats per pixel (indexed by span x).
// Texture coordinates are taken from the interpolated attribute arrayindex:
// (data + slope*x) * zf[x], scaled by the mip level's width/height.
// Texel bytes 2,1,0,3 are written to output channels 0,1,2,3, scaled to 0..1.
2288 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2291 int startx = span->startx;
2292 int endx = span->endx;
2297 float tc[2], endtc[2];
2299 unsigned int tci[2];
2300 unsigned int tci1[2];
2301 unsigned int tcimin[2];
2302 unsigned int tcimax[2];
2307 const unsigned char * RESTRICT pixelbase;
2308 const unsigned char * RESTRICT pixel[4];
2309 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2310 // if no texture is bound, just fill it with white
2313 for (x = startx;x < endx;x++)
2315 out4f[x*4+0] = 1.0f;
2316 out4f[x*4+1] = 1.0f;
2317 out4f[x*4+2] = 1.0f;
2318 out4f[x*4+3] = 1.0f;
// the mip level is chosen per texture unit by the triangle; mipmap[mip][0] is added
// to texture->bytes as a byte offset to reach that level's texels
2322 mip = triangle->mip[texunitindex];
2323 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2324 // if this mipmap of the texture is 1 pixel, just fill it with that color
// presumably mipmap[mip][1] is the level's size in bytes (4 = one 4-byte texel) - TODO confirm
2325 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads from texture->bytes (start of the texture data) rather
// than pixelbase (start of this mip level) - confirm that is intended
2327 c[0] = texture->bytes[2] * (1.0f/255.0f);
2328 c[1] = texture->bytes[1] * (1.0f/255.0f);
2329 c[2] = texture->bytes[0] * (1.0f/255.0f);
2330 c[3] = texture->bytes[3] * (1.0f/255.0f);
2331 for (x = startx;x < endx;x++)
2333 out4f[x*4+0] = c[0];
2334 out4f[x*4+1] = c[1];
2335 out4f[x*4+2] = c[2];
2336 out4f[x*4+3] = c[3];
2340 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2341 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2342 flags = texture->flags;
// mipmap[mip][2] and [3] serve as the level's width and height: they scale the
// coordinates, give the row pitch in texels (tciwidth) and, minus one, the
// clamp limits and wrap masks
2343 tcscale[0] = texture->mipmap[mip][2];
2344 tcscale[1] = texture->mipmap[mip][3];
2345 tciwidth = texture->mipmap[mip][2];
// NOTE(review): tcimin is read by the clamping loops below but is not assigned on
// any visible line - confirm it is initialized
2348 tcimax[0] = texture->mipmap[mip][2]-1;
2349 tcimax[1] = texture->mipmap[mip][3]-1;
2350 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2351 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinate at the first pixel, in texels, shifted by -0.5
2352 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2353 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// the coordinate is evaluated with zf only at sub-span ends and stepped linearly
// in 16.16 fixed point in between
2354 for (x = startx;x < endx;)
2356 unsigned int subtc[2];
2357 unsigned int substep[2];
2358 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2359 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: end on the final pixel (endx-1) and rescale the step to its length
2360 if (nextsub >= endx)
2362 nextsub = endsub = endx-1;
2363 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
// NOTE(review): tc is read below but its assignment is not on a visible line;
// presumably it takes the previous endtc before endtc is recomputed - confirm
2367 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2368 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2369 substep[0] = (endtc[0] - tc[0]) * subscale;
2370 substep[1] = (endtc[1] - tc[1]) * subscale;
2371 subtc[0] = tc[0] * (1<<16);
2372 subtc[1] = tc[1] * (1<<16);
// four-texel (bilinear) lookups, coordinates clamped to [tcimin, tcimax]
2375 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2377 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): & 0xFFF keeps the low 12 bits of the 16-bit fraction, not its high
// 12 bits - confirm this is the intended weight
2379 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2380 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000 * 0x1000
2381 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2382 tci[0] = subtc[0]>>16;
2383 tci[1] = subtc[1]>>16;
2384 tci1[0] = tci[0] + 1;
2385 tci1[1] = tci[1] + 1;
2386 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2387 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2388 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2389 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2390 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2391 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2392 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2393 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weights sum to 0x1000000 and a texel byte is at most 0xFF, so 1/0xFF000000 maps the sum to 0..1
2394 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2397 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2398 out4f[x*4+0] = c[0];
2399 out4f[x*4+1] = c[1];
2400 out4f[x*4+2] = c[2];
2401 out4f[x*4+3] = c[3];
// four-texel (bilinear) lookups, coordinates wrapped by AND with size-1
// (assumes power-of-two dimensions for the mask to act as a wrap - TODO confirm)
2406 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2408 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2409 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2410 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2411 tci[0] = subtc[0]>>16;
2412 tci[1] = subtc[1]>>16;
2413 tci1[0] = tci[0] + 1;
2414 tci1[1] = tci[1] + 1;
2415 tci[0] &= tciwrapmask[0];
2416 tci[1] &= tciwrapmask[1];
2417 tci1[0] &= tciwrapmask[0];
2418 tci1[1] &= tciwrapmask[1];
2419 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2420 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2421 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2422 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2423 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2426 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2427 out4f[x*4+0] = c[0];
2428 out4f[x*4+1] = c[1];
2429 out4f[x*4+2] = c[2];
2430 out4f[x*4+3] = c[3];
// single-texel lookups, coordinates clamped
2434 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2436 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2438 tci[0] = subtc[0]>>16;
2439 tci[1] = subtc[1]>>16;
2440 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2443 c[0] = pixel[0][2] * (1.0f / 255.0f);
2444 c[1] = pixel[0][1] * (1.0f / 255.0f);
2445 c[2] = pixel[0][0] * (1.0f / 255.0f);
2446 c[3] = pixel[0][3] * (1.0f / 255.0f);
2447 out4f[x*4+0] = c[0];
2448 out4f[x*4+1] = c[1];
2449 out4f[x*4+2] = c[2];
2450 out4f[x*4+3] = c[3];
// single-texel lookups, coordinates wrapped by AND with size-1
2455 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457 tci[0] = subtc[0]>>16;
2458 tci[1] = subtc[1]>>16;
2459 tci[0] &= tciwrapmask[0];
2460 tci[1] &= tciwrapmask[1];
2461 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462 c[0] = pixel[0][2] * (1.0f / 255.0f);
2463 c[1] = pixel[0][1] * (1.0f / 255.0f);
2464 c[2] = pixel[0][0] * (1.0f / 255.0f);
2465 c[3] = pixel[0][3] * (1.0f / 255.0f);
2466 out4f[x*4+0] = c[0];
2467 out4f[x*4+1] = c[1];
2468 out4f[x*4+2] = c[2];
2469 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture sampler: samples the texture bound to texunitindex
// for every pixel of the span and writes packed 4-byte pixels to out4ub (indexed by
// span x, also accessed as 32-bit words through outi). Texels are copied in their
// stored byte order. Coordinates come from the interpolated attribute arrayindex,
// multiplied by zf[] at sub-span ends and stepped in 16.16 fixed point in between.
2475 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2479 int startx = span->startx;
2480 int endx = span->endx;
2482 __m128 data, slope, tcscale;
2483 __m128i tcsize, tcmask, tcoffset, tcmax;
2485 __m128i subtc, substep, endsubtc;
2488 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2489 const unsigned char * RESTRICT pixelbase;
2490 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2491 // if no texture is bound, just fill it with white
2494 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
// mipmap[mip][0] is added to texture->bytes as a byte offset to reach this level's texels
2497 mip = triangle->mip[texunitindex];
2498 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499 // if this mipmap of the texture is 1 pixel, just fill it with that color
2500 if (texture->mipmap[mip][1] == 4)
// k is the level's first texel as one 32-bit word
2502 unsigned int k = *((const unsigned int *)pixelbase);
2503 for (x = startx;x < endx;x++)
2507 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2508 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2509 flags = texture->flags;
// tcsize lanes = { mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3] }
// (width, height twice, so two pixels' s/t pairs can be handled per register)
2510 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2511 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2512 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the low two components (s, t) into the high half and pre-scale by the texture size
2513 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2514 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2515 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2516 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds the 16-bit pair (4, width*4); _mm_madd_epi16 of
// an (x, y) 16-bit pair with it yields the byte offset x*4 + y*width*4
2517 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes: used both as the clamp maximum and as the wrap mask
2518 tcmax = _mm_packs_epi32(tcmask, tcmask);
2519 for (x = startx;x < endx;)
2521 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2522 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel (endx-1) and rescale the step to its length
2523 if (nextsub >= endx)
2525 nextsub = endsub = endx-1;
2526 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// NOTE(review): tc and subtc are read below but their assignments are not on visible
// lines; presumably they take the previous endtc/endsubtc - confirm
2530 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2531 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2532 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// pack two consecutive pixels per register (low half: pixel x, high half: pixel x+1)
// and double the step accordingly
2533 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2534 substep = _mm_slli_epi32(substep, 1);
// fast bilinear path: taken when the integer coordinates at the start and just past
// the end of the sub-span all satisfy 0 <= c < size-1, so neither clamping nor
// wrapping is needed and both horizontal neighbours can be fetched with one 8-byte load
2537 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2538 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = width*4 = bytes per texel row
2540 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2541 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2543 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high 16-bit) halves of s and t for both pixels
2544 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2545 tci = _mm_madd_epi16(tci, tcoffset);
2546 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2547 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2548 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2549 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2550 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2551 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// lerp form used throughout: a + mulhi((b - a) << 1, frac >> 1) = a + (((b - a) * frac) >> 16);
// rows are blended first (t fraction), then the two columns (s fraction)
2552 fracm = _mm_srli_epi16(subtc, 1);
2553 pix1 = _mm_add_epi16(pix1,
2554 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556 pix3 = _mm_add_epi16(pix3,
2557 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2558 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2559 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2560 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2561 pix2 = _mm_add_epi16(pix2,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2563 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2564 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the same lookup (low half of subtc only)
2568 const unsigned char * RESTRICT ptr1;
2569 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2570 tci = _mm_madd_epi16(tci, tcoffset);
2571 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2572 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574 fracm = _mm_srli_epi16(subtc, 1);
2575 pix1 = _mm_add_epi16(pix1,
2576 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2577 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2578 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2579 pix1 = _mm_add_epi16(pix1,
2580 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2581 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2582 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with clamp-to-edge: the four corner coordinates (x,y) (x+1,y) (x,y+1) (x+1,y+1)
// are formed by the 16-bit add, clamped to [0, size-1], and fetched one texel at a time
2586 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2588 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2590 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2591 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2592 tci = _mm_madd_epi16(tci, tcoffset);
2593 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2594 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2595 _mm_setzero_si128());
2596 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2597 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2598 _mm_setzero_si128());
2599 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is wrapped with an AND mask here, while
// the first pixel above (and the single-pixel case below) clamp with min/max -
// confirm whether this should clamp as well
2600 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2601 tci = _mm_madd_epi16(tci, tcoffset);
2602 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2603 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2604 _mm_setzero_si128());
2605 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2606 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2607 _mm_setzero_si128());
2608 fracm = _mm_srli_epi16(subtc, 1);
2609 pix1 = _mm_add_epi16(pix1,
2610 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2612 pix3 = _mm_add_epi16(pix3,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2614 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2615 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2616 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2617 pix2 = _mm_add_epi16(pix2,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2619 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2620 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel clamped bilinear lookup
2624 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2625 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2626 tci = _mm_madd_epi16(tci, tcoffset);
2627 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2628 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2629 _mm_setzero_si128());
2630 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2632 _mm_setzero_si128());
2633 fracm = _mm_srli_epi16(subtc, 1);
2634 pix1 = _mm_add_epi16(pix1,
2635 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2636 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2637 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2638 pix1 = _mm_add_epi16(pix1,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2641 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear with wrapping: same as above but every corner coordinate is ANDed with size-1
// (assumes power-of-two dimensions for the mask to act as a wrap - TODO confirm)
2647 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2649 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2650 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2651 tci = _mm_madd_epi16(tci, tcoffset);
2652 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2653 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2654 _mm_setzero_si128());
2655 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2656 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2657 _mm_setzero_si128());
2658 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2659 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2660 tci = _mm_madd_epi16(tci, tcoffset);
2661 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2662 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2663 _mm_setzero_si128());
2664 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2665 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2666 _mm_setzero_si128());
2667 fracm = _mm_srli_epi16(subtc, 1);
2668 pix1 = _mm_add_epi16(pix1,
2669 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2671 pix3 = _mm_add_epi16(pix3,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2673 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2674 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2675 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2676 pix2 = _mm_add_epi16(pix2,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2678 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2679 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel wrapped bilinear lookup
2683 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2684 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2685 tci = _mm_madd_epi16(tci, tcoffset);
2686 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2687 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2688 _mm_setzero_si128());
2689 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2690 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2691 _mm_setzero_si128());
2692 fracm = _mm_srli_epi16(subtc, 1);
2693 pix1 = _mm_add_epi16(pix1,
2694 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2695 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2696 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2697 pix1 = _mm_add_epi16(pix1,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2700 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (unfiltered) lookups with clamping: one 32-bit texel is copied per pixel
2707 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2709 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2711 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2713 tci = _mm_madd_epi16(tci, tcoffset);
2714 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2719 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel (unfiltered) lookups with wrapping
2728 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2730 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2731 tci = _mm_and_si128(tci, tcmax);
2732 tci = _mm_madd_epi16(tci, tcoffset);
2733 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2734 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2738 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2750 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2753 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// NOTE(review): only the signature is visible in this listing; the body is not shown.
// Presumably returns a shadow factor for the given vector - confirm against the full source.
2756 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA span buffer by an interpolated vertex attribute:
// out4f[x] = in4f[x] * ((data + slope*x) * z), per channel, for x in [startx, endx).
// Both buffers are indexed by span x with 4 floats per pixel.
2762 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2765 int startx = span->startx;
2766 int endx = span->endx;
// data and slope are handed to DPSOFTRAST_CALCATTRIB4F for attribute arrayindex
// (presumably filled in by it with the attribute's base value and per-pixel step)
2771 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2772 for (x = startx;x < endx;x++)
// NOTE(review): z is not assigned on any visible line; presumably z = zf[x] - confirm
2775 c[0] = (data[0] + slope[0]*x) * z;
2776 c[1] = (data[1] + slope[1]*x) * z;
2777 c[2] = (data[2] + slope[2]*x) * z;
2778 c[3] = (data[3] + slope[3]*x) * z;
2779 out4f[x*4+0] = in4f[x*4+0] * c[0];
2780 out4f[x*4+1] = in4f[x*4+1] * c[1];
2781 out4f[x*4+2] = in4f[x*4+2] * c[2];
2782 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to a float RGBA span buffer:
// out4f[x] = (data + slope*x) * z, per channel, for x in [startx, endx).
// out4f is indexed by span x with 4 floats per pixel.
2786 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2789 int startx = span->startx;
2790 int endx = span->endx;
// data and slope are handed to DPSOFTRAST_CALCATTRIB4F for attribute arrayindex
// (presumably filled in by it with the attribute's base value and per-pixel step)
2795 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796 for (x = startx;x < endx;x++)
// NOTE(review): z is not assigned on any visible line; presumably z = zf[x] - confirm
2799 c[0] = (data[0] + slope[0]*x) * z;
2800 c[1] = (data[1] + slope[1]*x) * z;
2801 c[2] = (data[2] + slope[2]*x) * z;
2802 c[3] = (data[3] + slope[3]*x) * z;
2803 out4f[x*4+0] = c[0];
2804 out4f[x*4+1] = c[1];
2805 out4f[x*4+2] = c[2];
2806 out4f[x*4+3] = c[3];
2810 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2812 int x, startx = span->startx, endx = span->endx;
2813 float c[4], localcolor[4];
2814 localcolor[0] = subcolor[0];
2815 localcolor[1] = subcolor[1];
2816 localcolor[2] = subcolor[2];
2817 localcolor[3] = subcolor[3];
2818 for (x = startx;x < endx;x++)
2820 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2821 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2822 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2823 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2824 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2825 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2826 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2827 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2831 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2833 int x, startx = span->startx, endx = span->endx;
2834 for (x = startx;x < endx;x++)
2836 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2837 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2838 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2839 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2843 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2845 int x, startx = span->startx, endx = span->endx;
2846 for (x = startx;x < endx;x++)
2848 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2849 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2850 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2851 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA span buffers per pixel: out4f = ina4f * a + inb4f * b for
// each pixel in [span->startx, span->endx), with a = 1 - (channel 3 of inb4f).
// All buffers are indexed by span x with 4 floats per pixel.
2855 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 int x, startx = span->startx, endx = span->endx;
2859 for (x = startx;x < endx;x++)
2861 a = 1.0f - inb4f[x*4+3];
// NOTE(review): b is not assigned on any visible line; presumably b = inb4f[x*4+3] - confirm
2863 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2864 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2865 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2866 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2870 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2872 int x, startx = span->startx, endx = span->endx;
2873 float localcolor[4], ilerp, lerp;
2874 localcolor[0] = color[0];
2875 localcolor[1] = color[1];
2876 localcolor[2] = color[2];
2877 localcolor[3] = color[3];
2878 ilerp = 1.0f - localcolor[3];
2879 lerp = localcolor[3];
2880 for (x = startx;x < endx;x++)
2882 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2883 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2884 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2885 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies the BGRA8 pixels of in4ub by an interpolated vertex attribute
// (selected by arrayindex) and stores the result in out4ub.
// The modulator is evaluated as (data + slope*x) * zf[x] at sub-span
// boundaries only, scaled by 256 into integers, and stepped with integer
// adds in between. Buffers are indexed by absolute x, 4 bytes per pixel.
// NOTE(review): x, data, slope, mod and endmod are used below but are not
// declared in the lines shown here, and no assignment to mod or submod is
// visible ahead of their first read — confirm against the complete function.
2891 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2895 int startx = span->startx;
2896 int endx = span->endx;
2899 __m128i submod, substep, endsubmod;
// macro defined elsewhere; presumably fills data/slope with the attribute's start value and per-pixel gradient
2900 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, keep lanes 1 and 3 (presumably RGBA -> BGRA channel order)
2901 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2902 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// modulator at the first pixel, then the same value scaled by 256 as int32
2903 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2904 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// outer loop: one iteration per sub-span; x is advanced by the inner code
2905 for (x = startx; x < endx;)
2907 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2908 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the real length
2909 if (nextsub >= endx)
2911 nextsub = endsub = endx-1;
2912 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// modulator at the end of this sub-span, the integer step derived from it, and the scaled end value
2916 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2917 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2918 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack to 16-bit: low four lanes = modulator for pixel x, high four lanes = modulator for pixel x+1
2919 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2920 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod is advanced by one substep per two-pixel iteration — confirm this is intended
2921 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// source bytes go into the high byte of each 16-bit lane (value * 256)
2923 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
// high 16 bits of the product: (value*256 * submod) >> 16 == value * submod / 256
2924 pix = _mm_mulhi_epu16(pix, submod);
// pack back to bytes with unsigned saturation and store both pixels
2925 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible in these lines)
2929 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2930 pix = _mm_mulhi_epu16(pix, submod);
2931 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute (selected by arrayindex) to
// out4ub as BGRA8 pixels. The value is evaluated as (data + slope*x) * zf[x]
// at sub-span boundaries, scaled by 4095 into integers, stepped with integer
// adds in between, then shifted right by 4 and saturated to bytes.
// NOTE(review): x, data, slope, mod and endmod are used below but are not
// declared in the lines shown here, and no assignment to mod or submod is
// visible ahead of their first read — confirm against the complete function.
2938 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2942 int startx = span->startx;
2943 int endx = span->endx;
2946 __m128i submod, substep, endsubmod;
// macro defined elsewhere; presumably fills data/slope with the attribute's start value and per-pixel gradient
2947 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2, keep lanes 1 and 3 (presumably RGBA -> BGRA channel order)
2948 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2949 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// value at the first pixel, then the same value scaled by 4095 as int32
2950 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2951 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// outer loop: one iteration per sub-span; x is advanced by the inner code
2952 for (x = startx; x < endx;)
2954 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2955 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: end on the final pixel and rescale the step to the real length
2956 if (nextsub >= endx)
2958 nextsub = endsub = endx-1;
2959 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// value at the end of this sub-span, the integer step derived from it, and the scaled end value
2963 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2964 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2965 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack to 16-bit: low four lanes = value for pixel x, high four lanes = value for pixel x+1
2966 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2967 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod is advanced by one substep per two-pixel iteration — confirm this is intended
2968 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits (4095 scale -> 0..255), then saturate to bytes and store both pixels
2970 __m128i pix = _mm_srai_epi16(submod, 4);
2971 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation (its guarding condition is not visible in these lines)
2975 __m128i pix = _mm_srai_epi16(submod, 4);
2976 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine for one span: out4ub = saturate(ina4ub + (inb4ub - subcolor*255)).
// Pixels are processed two at a time, with a single-pixel variant afterwards.
// Buffers are indexed by absolute x, 4 bytes per pixel. triangle is not used.
// NOTE(review): unlike DPSOFTRAST_Draw_Span_AddBloom, the difference
// (inb - subcolor) is not clamped at zero before it is added, so a negative
// difference reduces the result here — confirm which behavior is intended.
2983 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2986 int x, startx = span->startx, endx = span->endx;
// subcolor scaled by 255 to int32, with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
2987 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes; the same four values end up in both 64-bit halves (one half per pixel)
2988 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// two pixels per iteration
2989 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs from bytes to 16-bit lanes (zero high byte)
2991 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2992 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2993 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
// _mm_packus_epi16 saturates each lane to 0..255
2994 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
2998 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2999 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3000 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3001 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Component-wise product of two BGRA8 buffers for one span:
// out4ub = (ina4ub * inb4ub) / 256 per byte, two pixels per iteration with a
// single-pixel variant afterwards. Buffers are indexed by absolute x,
// 4 bytes per pixel. triangle is not used.
3006 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3009 int x, startx = span->startx, endx = span->endx;
3010 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes widened to 16-bit lanes (value); pix2: inb bytes in the high byte of each lane (value * 256)
3012 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of the product: (a * b*256) >> 16 == a*b / 256
3014 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3021 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Saturating component-wise sum of two BGRA8 buffers for one span:
// out4ub = min(ina4ub + inb4ub, 255) per byte, two pixels per iteration with
// a single-pixel variant afterwards. Buffers are indexed by absolute x,
// 4 bytes per pixel. triangle is not used.
3027 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3030 int x, startx = span->startx, endx = span->endx;
3031 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap
3033 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3035 pix1 = _mm_add_epi16(pix1, pix2);
// _mm_packus_epi16 saturates each lane to 0..255
3036 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3042 pix1 = _mm_add_epi16(pix1, pix2);
3043 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb4ub to ina4ub for one span of BGRA8 pixels:
// out4ub = saturate(ina4ub + inbtintbgra * inb4ub) per byte, two pixels per
// iteration with a single-pixel variant afterwards. Buffers are indexed by
// absolute x, 4 bytes per pixel. triangle is not used.
// No lane shuffle is applied to the tint (presumably it is already in BGRA order, as the parameter name suggests).
3048 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3051 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256 to int32, then narrowed to 16-bit lanes; both 64-bit halves hold the same four values
3052 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3053 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3054 for (x = startx;x+2 <= endx;x+=2)
// pix1: ina bytes widened (value); pix2: inb bytes in the high byte of each lane (value * 256)
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// (tint*256 * b*256) >> 16 == tint * b, added to a; saturated to 0..255 by the pack below
3058 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3065 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha-blends inb4ub over ina4ub for one span of BGRA8 pixels, using the
// fourth byte of each inb4ub pixel as the blend factor:
//   out4ub = ina4ub + (inb4ub - ina4ub) * alpha / 256
// Two pixels per iteration with a single-pixel variant afterwards. Buffers
// are indexed by absolute x, 4 bytes per pixel. triangle is not used.
3071 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3074 int x, startx = span->startx, endx = span->endx;
3075 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs from bytes to 16-bit lanes
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (alpha) of each pixel across that pixel's four lanes
3079 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// signed high multiply: ((diff*16) * (alpha*16)) >> 16 == diff * alpha / 256
3080 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3081 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3087 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a single uniform color over a span of BGRA8 pixels, weighted by the
// color's own alpha (color[3]):
//   out4ub = in4ub + (color*255 - in4ub) * (color[3]*255) / 256
// Two pixels per iteration with a single-pixel variant afterwards. Buffers
// are indexed by absolute x, 4 bytes per pixel. triangle is not used.
3094 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3097 int x, startx = span->startx, endx = span->endx;
// color scaled by 255 to int32, with lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3098 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes; both 64-bit halves hold the same four values (one half per pixel)
3099 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// blend = lane 3 (alpha) broadcast to every lane, pre-shifted left by 4 for the high multiply below
3100 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3101 for (x = startx;x+2 <= endx;x+=2)
3103 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// signed high multiply: ((diff*16) * (alpha*16)) >> 16 == diff * alpha / 256
3104 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3105 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant using 32-bit loads/stores (its guarding condition is not visible in these lines)
3109 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3110 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3111 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3118 void DPSOFTRAST_VertexShader_Generic(void)
3120 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3121 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3123 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3124 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3127 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3130 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3134 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3136 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3137 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3138 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3140 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3144 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3146 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3149 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3151 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3154 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3160 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3165 void DPSOFTRAST_VertexShader_PostProcess(void)
3167 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3168 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3169 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3172 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3174 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3175 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3176 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3179 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3180 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3182 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3183 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3185 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3186 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3188 // TODO: implement saturation
3190 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3192 // TODO: implement gammaramps
3194 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3199 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3201 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3204 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 // this is never called (because colormask is off when this shader is used)
3207 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3208 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3210 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3216 void DPSOFTRAST_VertexShader_FlatColor(void)
3218 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span:
//   result = texture color * Color_Ambient (alpha channel scaled by the Alpha uniform).
// When neither alpha test nor blending is active the result is written
// straight into the color framebuffer; otherwise it goes to a temporary
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of color, pix and pix2, and whatever
// advances x past the four-pixel path or skips masked pixels, are not
// present in the lines shown here — confirm against the complete function.
3222 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3225 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination inside color buffer 0 at row span->y, column span->x (4 bytes per pixel); indexed by x below
3226 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3227 int x, startx = span->startx, endx = span->endx;
3228 __m128i Color_Ambientm;
3229 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3230 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3231 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer instead
3234 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3235 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256 to int32, lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3236 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear lane 3 and replace it with (int)(Alpha * 255)
3237 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3238 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes; both 64-bit halves hold the same four values
3239 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3240 for (x = startx;x < endx;x++)
// four-pixel path: at least four pixels remain and all four mask bytes equal 0x01
3243 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3246 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each lane, so mulhi gives (ambient*256 * tex*256) >> 16 == ambient * tex
3247 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3248 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3249 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3255 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3256 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3257 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finish pass; the direct path already wrote the framebuffer
3259 if (pixel == buffer_FragColorbgra8)
3260 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3266 void DPSOFTRAST_VertexShader_VertexColor(void)
3268 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3269 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3270 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
//   result = (Color_Diffuse * vertex color + Color_Ambient) * texture color
// where the vertex color is the interpolated COLOR attribute multiplied by
// buffer_z[x]. Output goes straight to the color framebuffer unless alpha
// test or blending is active, in which case a temporary buffer is filled and
// passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declarations of data, slope, mod2 and pix2, and whatever
// advances x past the four-pixel path or skips masked pixels, are not
// present in the lines shown here — confirm against the complete function.
3273 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination inside color buffer 0 at row span->y, column span->x (4 bytes per pixel); indexed by x below
3277 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278 int x, startx = span->startx, endx = span->endx;
3279 __m128i Color_Ambientm, Color_Diffusem;
3281 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3282 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3285 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer instead
3287 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3288 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256, lanes 0 and 2 swapped (presumably RGBA -> BGRA), lane 3 replaced by (int)(Alpha * 255)
3289 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3290 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3291 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3292 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse scaled by 4096, same lane swap, lane 3 cleared (no diffuse contribution to alpha)
3293 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3294 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3295 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// macro defined elsewhere; presumably fills data/slope with the attribute's start value and per-pixel gradient
3296 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3297 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3298 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// move data to the first pixel of the span, then scale value and gradient by 4096
3299 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3300 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3301 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration; the four-pixel path adds three more steps itself
3302 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3304 __m128i color, mod, pix;
// four-pixel path: at least four pixels remain and all four mask bytes equal 0x01
3305 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3308 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3309 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// mod/mod2 hold the vertex color of pixels x..x+3, each multiplied by its own buffer_z entry
3310 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3311 data = _mm_add_ps(data, slope);
3312 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3313 data = _mm_add_ps(data, slope);
3314 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3315 data = _mm_add_ps(data, slope);
3316 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*4096 * color*4096) >> 16 == diffuse*color*256; add ambient*256; multiply by texture (high byte) via mulhi
3317 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3318 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3319 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3320 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3321 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same formula for one pixel
3327 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3328 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3329 mod = _mm_packs_epi32(mod, mod);
3330 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3331 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finish pass; the direct path already wrote the framebuffer
3333 if (pixel == buffer_FragColorbgra8)
3334 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3340 void DPSOFTRAST_VertexShader_Lightmap(void)
3342 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3343 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3344 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   result = (Color_Diffuse * lightmap + Color_Ambient) * color texture
//   with the GLOW permutation: result += Color_Glow * glow texture
// Output goes straight to the color framebuffer unless alpha test or
// blending is active, in which case a temporary buffer is filled and passed
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the declaration of pix2, whatever advances x past the
// four-pixel paths or skips masked pixels, and the construct that separates
// the glow loop from the non-glow loop are not present in the lines shown
// here — confirm against the complete function.
3347 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3350 unsigned char * RESTRICT pixelmask = span->pixelmask;
// destination inside color buffer 0 at row span->y, column span->x (4 bytes per pixel); indexed by x below
3351 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352 int x, startx = span->startx, endx = span->endx;
3353 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3354 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture is addressed with TEXCOORD0, the lightmap with TEXCOORD4
3360 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3361 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finish pass, so render into the temporary buffer instead
3362 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256, lanes 0 and 2 swapped (presumably RGBA -> BGRA), lane 3 replaced by (int)(Alpha * 255)
3364 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse scaled by 256, same lane swap, lane 3 cleared
3368 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant
3371 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3373 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Color_Glow scaled by 256, same lane swap, lane 3 cleared
3374 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3375 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3376 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path
3377 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3378 for (x = startx;x < endx;x++)
3380 __m128i color, lightmap, glow, pix;
// four-pixel path: at least four pixels remain and all four mask bytes equal 0x01
3381 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3384 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3385 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3386 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse*lightmap + ambient) * color + glowcolor * glow, for pixels x and x+1
3387 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3388 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3389 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
// same for pixels x+2 and x+3
3390 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3391 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3392 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3393 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit term is computed in the low 64 bits and the glow term in the high 64 bits
3399 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3400 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3401 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
// the high half of lightmap is zero, so the high half becomes glowcolor * glow
3402 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// fold the high half (glow term) onto the low half (lit term)
3403 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3404 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow
3409 for (x = startx;x < endx;x++)
3411 __m128i color, lightmap, pix;
// four-pixel path: at least four pixels remain and all four mask bytes equal 0x01
3412 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3415 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3416 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse*lightmap + ambient) * color, pixels x and x+1, then x+2 and x+3
3417 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3419 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3420 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3421 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3427 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3428 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3429 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3430 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finish pass; the direct path already wrote the framebuffer
3433 if (pixel == buffer_FragColorbgra8)
3434 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3440 void DPSOFTRAST_VertexShader_FakeLight(void)
3442 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3445 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3448 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3449 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3450 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3451 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3452 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light-direction maps: delegates to the
// lightmap vertex stage without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3462 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3464 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light-direction maps: delegates to the
// lightmap vertex stage without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3475 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3477 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex shader for the LightDirection mode.
// Reads the LightDir and EyePosition uniforms, then for every vertex projects
// the light direction and the vertex-to-eye vector onto the three vectors held
// in TEXCOORD1/2/3 (called svector, tvector and normal here). The light result
// overwrites TEXCOORD1 and the eye result overwrites TEXCOORD2, both with w = 0.
3483 void DPSOFTRAST_VertexShader_LightDirection(void)
3486 int numvertices = dpsoftrast.numvertices;
3488 float LightVector[4];
3489 float EyePosition[4];
3490 float EyeVectorModelSpace[4];
// Copy the 4-float LightDir and EyePosition uniform slots into locals.
// Component [3] of each is copied but not used by the loop below.
3496 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3497 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3498 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3499 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3500 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3501 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3502 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3503 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays: POSITION and TEXCOORD1-3 go through
// DPSOFTRAST_Array_Load, TEXCOORD0 goes through DPSOFTRAST_Array_Transform with
// the TexMatrixM1 uniform data.
// NOTE(review): presumably these calls populate dpsoftrast.post_array4f, which the loop reads - confirm.
3504 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3505 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3506 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3507 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3508 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3509 for (i = 0;i < numvertices;i++)
// Take local copies of the xyz of position, svector, tvector and normal,
// because TEXCOORD1 and TEXCOORD2 are overwritten further down.
3511 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3512 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3513 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3514 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3515 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3516 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3517 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3518 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3519 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3520 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3521 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3522 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir)
3523 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3524 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3525 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// Store the light vector in TEXCOORD1 (replacing svector) with w forced to 0.
3526 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3528 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3529 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then projected onto the same three vectors.
3530 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3531 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3532 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3533 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3534 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3535 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Store the eye vector in TEXCOORD2 (replacing tvector) with w forced to 0.
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3538 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3539 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Positions are projected last, after the loop has consumed them.
// NOTE(review): the second argument is -1 here; its meaning is defined by
// DPSOFTRAST_Array_TransformProject, which is not visible in this block - confirm.
3541 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small helper macros. They are plain text-substitution macros that expand
// their arguments more than once, so arguments must be free of side effects.
// Minimum / maximum of two values.
3544 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3545 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of elements [0..2] of two indexable operands.
3546 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length: the dot product of v with itself.
3547 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length, computed with sqrt() on the squared length.
3548 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement-style macro: it declares a local `len` holding sqrt(v . v).
// NOTE(review): the remaining continuation lines presumably rescale v by 1/len - confirm.
3549 #define DPSOFTRAST_Vector3Normalize(v)\
3552 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode; shades one span [startx, endx).
// thread supplies the uniforms and the shader permutation bits; triangle and
// span are forwarded to the span/texture helpers.
// Texture samples are used as raw 0..255 byte values, so every colour sum below
// is on a 0..255 scale before it is clamped and stored in buffer_FragColorbgra8.
// Three shading loops follow, one guarded by SHADERPERMUTATION_SPECULAR, one by
// SHADERPERMUTATION_DIFFUSE, and a last one that uses the ambient term only.
3563 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3565 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3566 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573 int x, startx = span->startx, endx = span->endx;
3574 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3575 float LightVectordata[4];
3576 float LightVectorslope[4];
3577 float EyeVectordata[4];
3578 float EyeVectorslope[4];
3580 float diffusetex[4];
3582 float surfacenormal[4];
3583 float lightnormal[4];
3585 float specularnormal[4];
3588 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped, presumably so
// that they line up with the B,G,R,A byte order of the *bgra8 span buffers.
3590 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3591 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3592 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3593 Color_Glow[3] = 0.0f;
3594 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3595 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3596 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component carries the Alpha uniform; it scales the output alpha.
3597 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3598 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3599 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3600 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3601 Color_Pants[3] = 0.0f;
3602 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3603 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3604 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3605 Color_Shirt[3] = 0.0f;
// Begin the span and sample the base colour texture; the pants/shirt and glow
// textures are sampled only when their permutation bits are set.
3606 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3607 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3608 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3610 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3613 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3615 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Specular path: ambient + diffuse + specular (+ optional glow) ----
3617 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3619 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3620 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3621 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3622 Color_Diffuse[3] = 0.0f;
3623 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3624 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3625 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3626 LightColor[3] = 0.0f;
// Light vector interpolation data comes from TEXCOORD1, eye vector data from TEXCOORD2.
3627 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3628 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3629 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3630 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3631 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3632 Color_Specular[3] = 0.0f;
// Pre-divide by 255 because the exponent is later multiplied by the 0..255 gloss alpha byte.
3633 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3634 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3635 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3636 for (x = startx;x < endx;x++)
3639 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3640 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3641 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3642 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Add the tinted pants and shirt layers to the base colour. Color_Pants[3] and
// Color_Shirt[3] are 0, so the alpha component is left unchanged.
3643 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3645 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3646 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3647 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3648 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3650 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3651 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3652 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3653 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1 maps 0..255 to about -1..+0.99.
// Component 0 is taken from byte 2 and component 2 from byte 0.
3654 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3655 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3656 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3657 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Evaluate the interpolated light vector at pixel x (data + slope*x) and scale by z.
// NOTE(review): z is assigned on a line not visible here, presumably from buffer_z[x] - confirm.
3659 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3660 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3661 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3662 DPSOFTRAST_Vector3Normalize(lightnormal);
// Same evaluation for the eye vector.
3664 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3665 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3666 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3667 DPSOFTRAST_Vector3Normalize(eyenormal);
// Half-way vector between the light and eye directions, used for the specular term.
3669 specularnormal[0] = lightnormal[0] + eyenormal[0];
3670 specularnormal[1] = lightnormal[1] + eyenormal[1];
3671 specularnormal[2] = lightnormal[2] + eyenormal[2];
3672 DPSOFTRAST_Vector3Normalize(specularnormal);
// Both dot products are clamped at 0 so surfaces facing away receive no light.
3674 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3675 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// The exponent is the scaled SpecularPower times the gloss texture's alpha byte.
3676 specular = pow(specular, SpecularPower * glosstex[3]);
// Per channel: glow + ambient + (diffuse + specular) * LightColor, clamped only
// at the upper bound 255. Alpha is texture alpha * Color_Ambient[3].
3677 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3679 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3680 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3681 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3682 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Variant of the same sum without the glow term.
3686 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3687 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3688 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3689 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3691 buffer_FragColorbgra8[x*4+0] = d[0];
3692 buffer_FragColorbgra8[x*4+1] = d[1];
3693 buffer_FragColorbgra8[x*4+2] = d[2];
3694 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Diffuse path: ambient + diffuse (+ optional glow), no specular ----
// NOTE(review): unlike the specular path, this loop never adds the pants/shirt
// layers to diffusetex even though those textures are sampled above when
// COLORMAPPING is set - confirm whether that is intentional.
3697 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3699 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3700 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3701 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3702 Color_Diffuse[3] = 0.0f;
3703 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3704 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3705 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3706 LightColor[3] = 0.0f;
3707 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3708 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709 for (x = startx;x < endx;x++)
3712 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3713 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3714 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3715 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Same normal map decode and light vector evaluation as in the specular path.
3716 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3717 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3718 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3719 DPSOFTRAST_Vector3Normalize(surfacenormal);
3721 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3722 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3723 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3724 DPSOFTRAST_Vector3Normalize(lightnormal);
3726 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Per channel: glow + texture * (ambient + diffuse * LightColor), clamped at 255.
3727 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3729 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3730 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3731 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3732 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// Variant of the same sum without the glow term.
3736 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3737 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3738 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3739 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3741 buffer_FragColorbgra8[x*4+0] = d[0];
3742 buffer_FragColorbgra8[x*4+1] = d[1];
3743 buffer_FragColorbgra8[x*4+2] = d[2];
3744 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Ambient-only loop: texture * ambient (+ optional glow) ----
// NOTE(review): presumably reached when neither SPECULAR nor DIFFUSE is set - confirm.
// Pants/shirt layers are not added here either.
3749 for (x = startx;x < endx;x++)
3752 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3753 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3754 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3755 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3757 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3759 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3760 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3761 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3762 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Variant of the same sum without the glow term.
3766 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3767 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3768 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3769 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3771 buffer_FragColorbgra8[x*4+0] = d[0];
3772 buffer_FragColorbgra8[x*4+1] = d[1];
3773 buffer_FragColorbgra8[x*4+2] = d[2];
3774 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished BGRA8 span to the common span-finish routine.
3777 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
// For every vertex it forms the vertex-to-light and vertex-to-eye vectors from
// the LightPosition and EyePosition uniforms, projects each onto the three
// vectors held in TEXCOORD1/2/3 (svector, tvector, normal) and stores the
// results in TEXCOORD1 (light) and TEXCOORD2 (eye), both with w = 0.
// After the loop it projects the positions and issues a transform involving
// TEXCOORD3, POSITION and the ModelToLightM1 uniform data.
3782 void DPSOFTRAST_VertexShader_LightSource(void)
3785 int numvertices = dpsoftrast.numvertices;
3786 float LightPosition[4];
3787 float LightVector[4];
3788 float LightVectorModelSpace[4];
3789 float EyePosition[4];
3790 float EyeVectorModelSpace[4];
// Copy the 4-float LightPosition and EyePosition uniform slots into locals.
// Component [3] of each is copied but not used by the loop below.
3796 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3797 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3798 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3799 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3800 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3801 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3802 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3803 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays: POSITION and TEXCOORD1-4 go through
// DPSOFTRAST_Array_Load, TEXCOORD0 goes through DPSOFTRAST_Array_Transform with
// the TexMatrixM1 uniform data. TEXCOORD4 is loaded but not read by the loop below.
// NOTE(review): presumably these calls populate dpsoftrast.post_array4f, which the loop reads - confirm.
3804 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3805 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3806 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3807 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3808 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3809 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3810 for (i = 0;i < numvertices;i++)
// Take local copies of the xyz of position, svector, tvector and normal,
// because TEXCOORD1 and TEXCOORD2 are overwritten further down.
3812 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3813 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3814 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3815 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3816 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3817 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3818 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3819 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3820 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3821 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3822 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3823 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, then projected onto svector/tvector/normal.
3824 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3825 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3826 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3827 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3828 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3829 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// Store the light vector in TEXCOORD1 (replacing svector) with w forced to 0.
3830 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3831 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3832 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3833 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, projected the same way.
3834 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3835 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3836 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3837 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3838 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3839 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// Store the eye vector in TEXCOORD2 (replacing tvector) with w forced to 0.
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3842 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3843 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Positions are projected after the loop has consumed them.
// NOTE(review): the second argument is -1 here; its meaning is defined by
// DPSOFTRAST_Array_TransformProject, which is not visible in this block - confirm.
3845 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Issued after the loop, which has already read TEXCOORD3 as `normal`.
// NOTE(review): presumably the argument order is (output, input, matrix), i.e.
// TEXCOORD3 receives POSITION transformed by ModelToLight - confirm against
// DPSOFTRAST_Array_Transform, including which copy of POSITION it reads.
3846 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3849 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3852 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3853 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3859 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3860 int x, startx = span->startx, endx = span->endx;
3861 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3862 float CubeVectordata[4];
3863 float CubeVectorslope[4];
3864 float LightVectordata[4];
3865 float LightVectorslope[4];
3866 float EyeVectordata[4];
3867 float EyeVectorslope[4];
3869 float diffusetex[4];
3871 float surfacenormal[4];
3872 float lightnormal[4];
3874 float specularnormal[4];
3877 float SpecularPower;
3878 float CubeVector[4];
3881 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3882 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3883 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3884 Color_Glow[3] = 0.0f;
3885 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3886 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3887 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3888 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3889 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3890 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3891 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3892 Color_Diffuse[3] = 0.0f;
3893 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3894 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3895 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3896 Color_Specular[3] = 0.0f;
3897 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3898 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3899 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3900 Color_Pants[3] = 0.0f;
3901 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3902 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3903 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3904 Color_Shirt[3] = 0.0f;
3905 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3906 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3907 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3908 LightColor[3] = 0.0f;
3909 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3911 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3912 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3913 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3914 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3915 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3916 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3918 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3921 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3922 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3923 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927 for (x = startx;x < endx;x++)
3930 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3931 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3932 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3933 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3934 if (attenuation < 0.01f)
3936 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3938 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3939 if (attenuation < 0.01f)
3943 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3944 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3945 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3946 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3947 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3949 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3950 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3951 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3952 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3954 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3955 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3956 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3957 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3958 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3959 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3960 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3961 DPSOFTRAST_Vector3Normalize(surfacenormal);
3963 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3964 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3965 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3966 DPSOFTRAST_Vector3Normalize(lightnormal);
3968 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3969 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3970 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3971 DPSOFTRAST_Vector3Normalize(eyenormal);
3973 specularnormal[0] = lightnormal[0] + eyenormal[0];
3974 specularnormal[1] = lightnormal[1] + eyenormal[1];
3975 specularnormal[2] = lightnormal[2] + eyenormal[2];
3976 DPSOFTRAST_Vector3Normalize(specularnormal);
3978 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3979 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3980 specular = pow(specular, SpecularPower * glosstex[3]);
3981 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3983 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3984 attenuation *= (1.0f / 255.0f);
3985 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3986 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3987 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3988 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3992 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3993 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3994 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3995 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3997 buffer_FragColorbgra8[x*4+0] = d[0];
3998 buffer_FragColorbgra8[x*4+1] = d[1];
3999 buffer_FragColorbgra8[x*4+2] = d[2];
4000 buffer_FragColorbgra8[x*4+3] = d[3];
4003 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4005 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4006 for (x = startx;x < endx;x++)
4009 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4010 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4011 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4012 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4013 if (attenuation < 0.01f)
4015 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4017 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4018 if (attenuation < 0.01f)
4022 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4023 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4024 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4025 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4026 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4028 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4029 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4030 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4031 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4033 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4034 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4035 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4036 DPSOFTRAST_Vector3Normalize(surfacenormal);
4038 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4039 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4040 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4041 DPSOFTRAST_Vector3Normalize(lightnormal);
4043 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4044 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4046 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4047 attenuation *= (1.0f / 255.0f);
4048 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4049 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4050 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4051 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4055 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4056 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4057 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4058 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4060 buffer_FragColorbgra8[x*4+0] = d[0];
4061 buffer_FragColorbgra8[x*4+1] = d[1];
4062 buffer_FragColorbgra8[x*4+2] = d[2];
4063 buffer_FragColorbgra8[x*4+3] = d[3];
4068 for (x = startx;x < endx;x++)
4071 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4072 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4073 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4074 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4075 if (attenuation < 0.01f)
4077 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4079 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4080 if (attenuation < 0.01f)
4084 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4085 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4086 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4087 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4088 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4090 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4091 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4092 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4093 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4095 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4097 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4098 attenuation *= (1.0f / 255.0f);
4099 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4100 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4101 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4102 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4106 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4107 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4108 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4109 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4111 buffer_FragColorbgra8[x*4+0] = d[0];
4112 buffer_FragColorbgra8[x*4+1] = d[1];
4113 buffer_FragColorbgra8[x*4+2] = d[2];
4114 buffer_FragColorbgra8[x*4+3] = d[3];
4117 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4123 void DPSOFTRAST_VertexShader_Refraction(void)
4125 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4128 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4131 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4132 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4134 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4135 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4140 void DPSOFTRAST_VertexShader_Water(void)
4142 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4146 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4149 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4150 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4152 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4153 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4158 void DPSOFTRAST_VertexShader_ShowDepth(void)
4160 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4163 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4166 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4167 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4168 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4169 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4170 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4175 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4177 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4180 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4183 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4184 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4185 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4186 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4187 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4192 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4194 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4197 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4200 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4201 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4203 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4204 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; DPSOFTRAST_ShaderModeTable holds one of these per shader mode.
// NOTE(review): the table initializers start with an integer ahead of the Vertex pointer, and
// DPSOFTRAST_Interpret_Draw reads a .lodarrayindex member, so at least one member line
// appears to be missing from this listing - confirm against the full file.
4209 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point; takes no arguments
4212 void (*Vertex)(void);
// span (pixel) stage entry point, called with the thread state, the triangle and the span
4213 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array indices this mode uses; readers compare each entry against DPSOFTRAST_ARRAY_TOTAL
// (the table writes ~0, i.e. 255 once stored in an unsigned char, after the last used entry)
4214 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices this mode uses; readers compare each entry against DPSOFTRAST_MAXTEXTUREUNITS
4215 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4217 DPSOFTRAST_ShaderModeInfo;
4219 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4221 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4222 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4223 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4224 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4225 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4226 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4227 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4228 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4229 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4230 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4231 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4232 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4233 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4234 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4235 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4236 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Processes every span queued on this thread (thread->spans[0 .. numspans-1]): optionally depth-tests it,
// runs the span function of the current shader mode, and finally resets thread->numspans to 0.
// NOTE(review): several short lines (braces, local declarations, loop bodies) are absent from this listing;
// comments below only describe what the visible lines establish.
4239 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4246 // unsigned int *colorpixel;
4247 unsigned int *depthpixel;
4253 DPSOFTRAST_State_Triangle *triangle;
4254 DPSOFTRAST_State_Span *span;
// one byte per pixel of the current span; lives on this function's stack, so span->pixelmask
// is only meaningful while the span function is being called from here
4255 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4256 for (i = 0; i < thread->numspans; i++)
4258 span = &thread->spans[i];
4259 triangle = &thread->triangles[span->triangle];
// depth-tested path: needs both the depthtest flag and an allocated depth buffer
4260 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w is a plane equation: w[0] = change per x, w[1] = change per y, w[2] = value at the origin
4262 wslope = triangle->w[0];
4263 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to integer depth units; the polygon offset is folded into the starting depth only
4264 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4265 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer entry x pixels to the right of (span->x, span->y)
4266 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4267 startx = span->startx;
// fill pixelmask[startx .. endx-1] with the result of comparing the stored depth against the
// incoming depth d, using the comparison selected by thread->fb_depthfunc
4269 switch(thread->fb_depthfunc)
4272 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4273 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4274 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4275 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4276 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4277 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4278 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4280 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4281 //for (x = startx;x < endx;x++)
4282 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4283 // if there is no color buffer, skip pixel shader
// trim pixels that failed the depth test from both ends of the span
// NOTE(review): the loop bodies are not visible here - presumably startx++ and endx--; confirm
4284 while (startx < endx && !pixelmask[startx])
4286 while (endx > startx && !pixelmask[endx-1])
4289 continue; // no pixels to fill
4290 span->pixelmask = pixelmask;
4291 span->startx = startx;
4293 // run pixel shader if appropriate
4294 // do this before running depthmask code, to allow the pixelshader
4295 // to clear pixelmask values for alpha testing
4296 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4297 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes are enabled: walk the trimmed span again with the same depth interpolation
// NOTE(review): the loop body is not visible here - presumably it stores d for pixels still set in pixelmask; confirm
4298 if (thread->depthmask)
4299 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4305 // no depth testing means we're just dealing with color...
4306 // if there is no color buffer, skip pixel shader
4307 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// without a depth test every pixel of the span is marked as passing
4309 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4310 span->pixelmask = pixelmask;
4311 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4315 thread->numspans = 0;
// Declares the Draw command (number 22) through the DEFCOMMAND macro; the member list given here matches the
// fields the draw functions below read and write through DPSOFTRAST_Command_Draw.
// NOTE(review): DEFCOMMAND is defined outside this chunk. The draw code also reads command->commandsize, which
// is not in this list, so the macro presumably contributes a common command header - confirm.
4318 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on a worker thread: for each triangle it clips against the near plane,
// culls by screen bounding box, computes the per-triangle interpolation planes and mip levels,
// scan-converts the triangle into spans and flushes them through DPSOFTRAST_Draw_ProcessSpans.
// The command's refcount is decremented on exit; the thread that brings it to zero frees
// command->arrays when that storage was not embedded in the command itself.
// NOTE(review): many short lines (braces, else/continue/return, short declarations) are absent from
// this listing; comments below describe only what the visible lines establish.
4320 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4323 int cullface = thread->cullface;
4324 int minx, maxx, miny, maxy;
4325 int miny1, maxy1, miny2, maxy2;
4326 __m128i fbmin, fbmax;
4327 __m128 viewportcenter, viewportscale;
4328 int firstvertex = command->firstvertex;
4329 int numvertices = command->numvertices;
4330 int numtriangles = command->numtriangles;
4331 const int *element3i = command->element3i;
4332 const unsigned short *element3s = command->element3s;
4333 int clipped = command->clipped;
4340 int starty, endy, bandy;
4344 __m128 triangleedge1, triangleedge2, trianglenormal;
4347 DPSOFTRAST_State_Triangle *triangle;
4348 DPSOFTRAST_Texture *texture;
4349 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, and this thread's two row bands (miny1..maxy1 and miny2..maxy2) clamped to it
4350 miny = thread->fb_scissor[1];
4351 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4352 miny1 = bound(miny, thread->miny1, maxy);
4353 maxy1 = bound(miny, thread->maxy1, maxy);
4354 miny2 = bound(miny, thread->miny2, maxy);
4355 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range touches neither band: drop this thread's reference
// NOTE(review): the early exit that presumably follows is not visible in this listing
4356 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4358 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the aligned header has its vertex data in a separate MM_CALLOC block
4360 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4361 MM_FREE(command->arrays);
4365 minx = thread->fb_scissor[0];
4366 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// inclusive clamp rectangle as 16-bit (x, y) pairs, spanning from the top of band 1 to the bottom of band 2
4367 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4368 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4369 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4370 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4371 screen[3] = _mm_setzero_ps();
4372 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4373 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space float[4] entries; the vertex arrays follow
4375 const float *screencoord4f = command->arrays;
4376 const float *arrays = screencoord4f + numvertices*4;
4378 // generate the 3 edges of this triangle
4379 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are made relative to firstvertex; read from the 16-bit or the 32-bit index list
4382 e[0] = element3s[i*3+0] - firstvertex;
4383 e[1] = element3s[i*3+1] - firstvertex;
4384 e[2] = element3s[i*3+2] - firstvertex;
4388 e[0] = element3i[i*3+0] - firstvertex;
4389 e[1] = element3i[i*3+1] - firstvertex;
4390 e[2] = element3i[i*3+2] - firstvertex;
// helper macros used by the clipping code below:
//  SKIPBACKFACE      - builds triangleedge1/2 and the scalar cross product in trianglenormal, then tests its sign
//  CLIPPEDVERTEXLERP - clipfrac[p1] = clipdist[p1] / (clipdist[p1] - clipdist[p2]); projects the interpolated vertex into screen[k]
//  CLIPPEDVERTEXCOPY - loads the precomputed screen coordinate of vertex p1 into screen[k]
4399 #define SKIPBACKFACE \
4400 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4401 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4402 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4403 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4404 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4408 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4412 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4417 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4418 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4420 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4421 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4423 #define CLIPPEDVERTEXCOPY(k,p1) \
4424 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// attribute counterparts of the two macros above: GENATTRIBCOPY loads vertex p1 of the current array,
// GENATTRIBLERP interpolates p1 -> p2 with the clipfrac[p1] saved during clipping, and GENATTRIBS
// picks the combination per corner according to a clip case number 0..6
4426 #define GENATTRIBCOPY(attrib, p1) \
4427 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4428 #define GENATTRIBLERP(attrib, p1, p2) \
4430 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4431 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4433 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4437 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4438 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4439 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4440 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4441 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4442 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4443 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4449 // calculate distance from nearplane
4450 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4451 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4452 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4453 if (clipdist[0] >= 0.0f)
4455 if (clipdist[1] >= 0.0f)
4457 if (clipdist[2] >= 0.0f)
4460 // triangle is entirely in front of nearplane
4461 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4468 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4476 if (clipdist[2] >= 0.0f)
4478 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4485 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4492 else if (clipdist[1] >= 0.0f)
4494 if (clipdist[2] >= 0.0f)
4496 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4503 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4509 else if (clipdist[2] >= 0.0f)
4511 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4516 else continue; // triangle is entirely behind nearplane
4519 // calculate integer y coords for triangle points
// pack the (x, y) of up to four screen points into 16-bit lanes; with three points, point 2 is duplicated into the fourth slot
4520 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4521 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4522 screenmin = _mm_min_epi16(screeni, screenir),
4523 screenmax = _mm_max_epi16(screeni, screenir);
// reduce to the bounding box of all points, then clamp it to the fbmin/fbmax rectangle
4524 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4525 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4526 screenmin = _mm_max_epi16(screenmin, fbmin);
4527 screenmax = _mm_min_epi16(screenmax, fbmax);
4528 // skip offscreen triangles
4529 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// clamped row range of the triangle; endy is exclusive
4531 starty = _mm_extract_epi16(screenmin, 1);
4532 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two bands
4533 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps only the y half of each packed (x, y) pair, one 32-bit lane per point
4535 screeny = _mm_srai_epi32(screeni, 16);
4538 triangle = &thread->triangles[thread->numtriangles];
4540 // calculate attribute plans for triangle data...
4541 // okay, this triangle is going to produce spans, we'd better project
4542 // the interpolants now (this is what gives perspective texturing),
4543 // this consists of simply multiplying all arrays by the W coord
4544 // (which is basically 1/Z), which will be undone per-pixel
4545 // (multiplying by Z again) to get the perspective-correct array
4548 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4549 __m128 mipedgescale, mipdensity;
// screen-space edge components divided by the scalar cross product held in trianglenormal
4550 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4551 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4552 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4553 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4554 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast the fourth component of each screen vertex
4555 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4556 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4557 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w itself, anchored at vertex 1 and stored as x slope, y slope, origin value
4558 attribedge1 = _mm_sub_ss(w0, w1);
4559 attribedge2 = _mm_sub_ss(w2, w1);
4560 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4561 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4562 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4563 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4564 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4565 _mm_store_ss(&triangle->w[0], attribxslope);
4566 _mm_store_ss(&triangle->w[1], attribyslope);
4567 _mm_store_ss(&triangle->w[2], attriborigin);
4568 mipedgescale = _mm_setzero_ps();
// same plane setup for every vertex array listed for the current shader mode, with each
// attribute first multiplied by its vertex's w
4569 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4571 __m128 attrib0, attrib1, attrib2;
4572 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an entry at or beyond DPSOFTRAST_ARRAY_TOTAL marks the end of the mode's array list
4573 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next array inside the command data (arrays are stored back to back)
4575 arrays += numvertices*4;
4576 GENATTRIBS(attrib0, attrib1, attrib2);
4577 attriborigin = _mm_mul_ps(attrib1, w1);
4578 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4579 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4580 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4581 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4582 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4583 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4584 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4585 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives mip selection: attribute change along each of the two edges
// multiplied by the reciprocal screen-space length of that edge
4586 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4588 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4589 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4590 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4591 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for every texture unit listed for the current shader mode; units not handled stay at 0
4595 memset(triangle->mip, 0, sizeof(triangle->mip));
4596 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4598 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4599 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4601 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
4602 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3], square, sum per edge, keep the smaller edge value
4604 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4605 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4606 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4607 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4608 // this will be multiplied in the texturing routine by the texture resolution
4609 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), truncated, then limited to the texture's last mip level
4612 y = (int)(log((float)y)*0.5f/M_LN2);
4613 if (y > texture->mipmaps - 1)
4614 y = texture->mipmaps - 1;
4615 triangle->mip[texunit] = y;
// scan conversion: rows starty..endy-1, first limited by band 1 (maxy1); each further pass
// is limited by band 2 (maxy2) and starts no earlier than miny2
4621 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4624 __m128 xcoords, xslope;
// per point: lane is all ones when y is greater than that point's integer y; movemask yields 4 bits per point
4625 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4626 int yccmask = _mm_movemask_epi8(ycc);
4627 int edge0p, edge0n, edge1p, edge1n;
// edge selection for the four-point case: edge0 runs from point edge0p to edge0n, edge1 from edge1p to edge1n
4634 case 0xFFFF: /*0000*/ y = endy; continue;
4635 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4636 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4637 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4638 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4639 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4640 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4641 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4642 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4643 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4644 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4645 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4646 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4647 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4648 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4649 case 0x0000: /*1111*/ y++; continue;
// edge selection for the three-point case; the two top nibbles always agree because point 2 was duplicated when packing
4657 case 0xFFFF: /*000*/ y = endy; continue;
4658 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4659 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4660 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4661 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4662 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4663 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4664 case 0x0000: /*111*/ y++; continue;
// nexty: smallest integer y among the points not yet passed by y (passed lanes are forced to
// 0x7FFF so they lose the min), limited to the last row of the current band
4667 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4668 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4669 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4670 nexty = _mm_extract_epi16(ycc, 0);
4671 if (nexty >= bandy) nexty = bandy-1;
// compares the larger x of edge0's endpoints with the smaller x of edge1's endpoints
// NOTE(review): the statements guarded by this test are not visible - presumably the two edges are swapped; confirm
4672 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// x step per row for both edges (dx/dy), and the x of both edges at row y, plus 0.5
4681 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4682 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4683 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4684 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4685 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4686 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4688 int startx, endx, offset;
// element 0 of xcoords is edge0's x, element 2 is edge1's x
4689 startx = _mm_cvtss_si32(xcoords);
4690 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4693 if (startx < 0) startx = 0;
// advance startx towards minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (the mask form assumes that constant is a power of two - TODO confirm)
4694 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4696 if (endx > maxx) endx = maxx;
4697 if (startx >= endx) continue;
// emit the row as spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; span->startx/endx are relative to offset
4698 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4700 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4701 span->triangle = thread->numtriangles;
4704 span->startx = max(minx - offset, 0);
4705 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4706 if (span->startx >= span->endx)
// span buffer full: process the queued spans now (ProcessSpans resets numspans)
4708 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4709 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush the spans that refer to it and start over at slot 0
4714 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4716 DPSOFTRAST_Draw_ProcessSpans(thread);
4717 thread->numtriangles = 0;
// drop this thread's reference; the last one out frees vertex data that was allocated separately
4721 if (!ATOMIC_DECREMENT(command->refcount))
4723 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4724 MM_FREE(command->arrays);
// flush whatever is still queued so no span refers to this command's triangles afterwards
4727 if (thread->numspans > 0 || thread->numtriangles > 0)
4729 DPSOFTRAST_Draw_ProcessSpans(thread);
4730 thread->numtriangles = 0;
// Reserves a Draw command together with storage for the per-vertex arrays and a private copy of the
// index list, fills in the command's vertex/triangle fields, and points dpsoftrast.screencoord4f and
// dpsoftrast.post_array4f[] at the reserved storage.
//   firstvertex, numvertices - vertex range of the draw; every array is sized for numvertices float[4] entries
//   numtriangles             - number of index triples copied
//   element3i / element3s    - 32-bit / 16-bit index lists (the lines selecting between them are not visible in this listing)
// NOTE(review): short lines (braces, if/else, the return statement) are absent from this listing.
4735 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4739 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two arrays are always present: the screen coordinates and the post-transform position
4740 int datasize = 2*numvertices*sizeof(float[4]);
4741 DPSOFTRAST_Command_Draw *command;
4742 unsigned char *data;
// plus one array for every entry in the current shader mode's array list
4743 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4745 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4746 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4748 datasize += numvertices*sizeof(float[4]);
// plus room for the index list, in whichever width is used
4751 datasize += numtriangles*sizeof(unsigned short[3]);
4753 datasize += numtriangles*sizeof(int[3]);
4754 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: the command carries only its header and the data lives in a separate block,
// which DPSOFTRAST_Interpret_Draw releases with MM_FREE once the refcount reaches zero
// NOTE(review): the MM_CALLOC result is used without a NULL check in the visible lines
4755 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4757 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4758 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data follows the aligned command header inside the same allocation
4762 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4763 data = (unsigned char *)command + commandsize;
4765 command->firstvertex = firstvertex;
4766 command->numvertices = numvertices;
4767 command->numtriangles = numtriangles;
4768 command->arrays = (float *)data;
// carve the data block up in the order the draw interpreter walks it:
// screen coordinates, position, then the shader mode's arrays in list order
4769 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4770 dpsoftrast.firstvertex = firstvertex;
4771 dpsoftrast.numvertices = numvertices;
4772 dpsoftrast.screencoord4f = (float *)data;
4773 data += numvertices*sizeof(float[4]);
4774 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4775 data += numvertices*sizeof(float[4]);
4776 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4778 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4779 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4781 dpsoftrast.post_array4f[j] = (float *)data;
4782 data += numvertices*sizeof(float[4]);
// the index list is copied behind the vertex arrays, so the command does not keep the caller's pointers
4784 command->element3i = NULL;
4785 command->element3s = NULL;
4788 command->element3s = (unsigned short *)data;
4789 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4793 command->element3i = (int *)data;
4794 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: builds a Draw command for the given vertex range
// and index arrays, runs the current shader mode's Vertex() function, and
// then hands the command to the rasterizer threads.
//   firstvertex, numvertices, numtriangles, element3i, element3s are passed
//   unchanged to DPSOFTRAST_Draw_AllocateDrawCommand.
4799 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4801 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4802 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the vertical extent reported in dpsoftrast.drawstarty/drawendy to the framebuffer
4803 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4804 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty vertical range: release the separately allocated arrays (header-only
// commands keep their data outside the command) and roll the command back;
// presumably the function returns here - confirm
4805 if (command->starty >= command->endy)
4807 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4808 MM_FREE(command->arrays);
4809 DPSOFTRAST_UndoCommand(command->commandsize);
4812 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; the interpreting code decrements it with
// ATOMIC_DECREMENT and acts when the count reaches zero
4813 command->refcount = dpsoftrast.numthreads;
4816 DPSOFTRAST_Draw_SyncCommands();
// wake every starving thread whose first or second row band overlaps the draw
4819 for (i = 0; i < dpsoftrast.numthreads; i++)
4821 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4822 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4823 SDL_CondSignal(thread->drawcond);
// NOTE(review): presumably an alternative to the signalling path above
// (e.g. when threads are not in use) rather than always executed - confirm
4827 DPSOFTRAST_Draw_FlushThreads();
// Executes queued commands for one thread, starting at thread->commandoffset
// and stopping when the offset equals endoffset.
//   thread    - thread state passed to every DPSOFTRAST_Interpret_* handler
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
// The final offset is written back to thread->commandoffset.
4831 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4833 int commandoffset = thread->commandoffset;
4834 while (commandoffset != endoffset)
4836 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4837 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: it calls
// DPSOFTRAST_Interpret_<name>, advances the offset by the aligned size of
// DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
4839 #define INTERPCOMMAND(name) \
4840 case DPSOFTRAST_OPCODE_##name : \
4841 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4842 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4843 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4844 commandoffset = 0; \
4846 INTERPCOMMAND(Viewport)
4847 INTERPCOMMAND(ClearColor)
4848 INTERPCOMMAND(ClearDepth)
4849 INTERPCOMMAND(ColorMask)
4850 INTERPCOMMAND(DepthTest)
4851 INTERPCOMMAND(ScissorTest)
4852 INTERPCOMMAND(Scissor)
4853 INTERPCOMMAND(BlendFunc)
4854 INTERPCOMMAND(BlendSubtract)
4855 INTERPCOMMAND(DepthMask)
4856 INTERPCOMMAND(DepthFunc)
4857 INTERPCOMMAND(DepthRange)
4858 INTERPCOMMAND(PolygonOffset)
4859 INTERPCOMMAND(CullFace)
4860 INTERPCOMMAND(AlphaTest)
4861 INTERPCOMMAND(AlphaFunc)
4862 INTERPCOMMAND(SetTexture)
4863 INTERPCOMMAND(SetShader)
4864 INTERPCOMMAND(Uniform4f)
4865 INTERPCOMMAND(UniformMatrix4f)
4866 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the step comes from the command's own
// commandsize field instead of a sizeof
4868 case DPSOFTRAST_OPCODE_Draw:
4869 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4870 commandoffset += command->commandsize;
4871 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// the offset is stored back right after a draw, not only at the end of the loop
4873 thread->commandoffset = commandoffset;
// presumably Reset restarts reading at the beginning of the pool - confirm
4876 case DPSOFTRAST_OPCODE_Reset:
4881 thread->commandoffset = commandoffset;
// Thread entry point (passed to SDL_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread this worker operates on
// Loops while thread->index is non-negative, interpreting commands whenever
// the thread's offset differs from dpsoftrast.drawcommand and otherwise
// sleeping on thread->drawcond.
4885 static int DPSOFTRAST_Draw_Thread(void *data)
4887 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4888 while(thread->index >= 0)
// unlocked check: there are commands this thread has not consumed yet
4890 if (thread->commandoffset != dpsoftrast.drawcommand)
4892 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// presumably the idle path (taken when nothing was pending) - confirm
4896 SDL_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping
4897 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a waiter (the waiting flag is set in DPSOFTRAST_Draw_FlushThreads) that this thread has caught up
4899 if (thread->waiting) SDL_CondSignal(thread->waitcond);
// starving marks the thread as asleep on drawcond so producers know to signal it
4900 thread->starving = true;
4901 SDL_CondWait(thread->drawcond, thread->drawmutex);
4902 thread->starving = false;
4904 SDL_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand and then
// marks the command pool as empty (usedcommands = 0).
4911 static void DPSOFTRAST_Draw_FlushThreads(void)
4913 DPSOFTRAST_State_Thread *thread;
4915 DPSOFTRAST_Draw_SyncCommands();
// pass 1: wake every thread that is behind and currently asleep on drawcond
4917 for (i = 0; i < dpsoftrast.numthreads; i++)
4919 thread = &dpsoftrast.threads[i];
4920 if (thread->commandoffset != dpsoftrast.drawcommand)
4922 SDL_LockMutex(thread->drawmutex);
// checked again under the mutex, which the worker also holds around its sleep
4923 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4924 SDL_CondSignal(thread->drawcond);
4925 SDL_UnlockMutex(thread->drawmutex);
// pass 2: block on each thread's waitcond until it has caught up
4929 for (i = 0; i < dpsoftrast.numthreads; i++)
4931 thread = &dpsoftrast.threads[i];
4933 if (thread->commandoffset != dpsoftrast.drawcommand)
4935 SDL_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by a single `if`, not a loop; if
// SDL_CondWait can return before the worker has caught up, this continues
// early - confirm
4936 if (thread->commandoffset != dpsoftrast.drawcommand)
4938 thread->waiting = true;
4939 SDL_CondWait(thread->waitcond, thread->drawmutex);
4940 thread->waiting = false;
4942 SDL_UnlockMutex(thread->drawmutex);
// runs the pending commands on the calling thread; presumably the path used
// when worker threads are not active - confirm
4945 if (thread->commandoffset != dpsoftrast.drawcommand)
4946 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4949 dpsoftrast.commandpool.usedcommands = 0;
// Public flush: delegates to DPSOFTRAST_Draw_FlushThreads.
4952 void DPSOFTRAST_Flush(void)
4954 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point; takes no arguments and returns nothing.
// NOTE(review): presumably equivalent to DPSOFTRAST_Flush - confirm against the body.
4957 void DPSOFTRAST_Finish(void)
// Resets the global rasterizer state and starts the rasterizer threads.
//   width, height - framebuffer dimensions (stored as fb_width / fb_height)
//   numthreads    - requested thread count, clamped to 1..64
//   interlace     - clamped to 0..1; selects how rows are divided among threads
//   colorpixels   - color buffer, stored as fb_colorpixels[0]
//   depthpixels   - depth buffer, stored as fb_depthpixels
4962 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// everything not explicitly set below starts out as zero
4972 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// presumably u is a union whose byte 3 reveals the host byte order - confirm
4973 dpsoftrast.bigendian = u.b[3];
4974 dpsoftrast.fb_width = width;
4975 dpsoftrast.fb_height = height;
4976 dpsoftrast.fb_depthpixels = depthpixels;
4977 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; this looks like a copy/paste
// slip for further indices - confirm the array size. The memset above has
// already zeroed the whole array, so the result is the same either way.
4978 dpsoftrast.fb_colorpixels[1] = NULL;
4979 dpsoftrast.fb_colorpixels[1] = NULL;
4980 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
4981 dpsoftrast.viewport[0] = 0;
4982 dpsoftrast.viewport[1] = 0;
4983 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4984 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4985 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at 1; presumably index 0 is reserved - confirm
4986 dpsoftrast.texture_firstfree = 1;
4987 dpsoftrast.texture_end = 1;
4988 dpsoftrast.texture_max = 0;
4989 dpsoftrast.color[0] = 1;
4990 dpsoftrast.color[1] = 1;
4991 dpsoftrast.color[2] = 1;
4992 dpsoftrast.color[3] = 1;
4993 dpsoftrast.interlace = bound(0, interlace, 1);
// presumably the clamped count applies to threaded builds and the fixed 1 to
// builds without threads - confirm
4995 dpsoftrast.numthreads = bound(1, numthreads, 64);
4997 dpsoftrast.numthreads = 1;
// zero-initialised thread array
// NOTE(review): no NULL check on this allocation is visible before it is used
4999 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5000 for (i = 0; i < dpsoftrast.numthreads; i++)
5002 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5004 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines, so it keeps
// the 0 from MM_CALLOC while channels 1..3 are enabled - confirm this is intended
5005 thread->colormask[1] = 1;
5006 thread->colormask[2] = 1;
5007 thread->colormask[3] = 1;
5008 thread->blendfunc[0] = GL_ONE;
5009 thread->blendfunc[1] = GL_ZERO;
5010 thread->depthmask = true;
5011 thread->depthtest = true;
5012 thread->depthfunc = GL_LEQUAL;
5013 thread->scissortest = false;
5014 thread->alphatest = false;
5015 thread->alphafunc = GL_GREATER;
5016 thread->alphavalue = 0.5f;
5017 thread->viewport[0] = 0;
5018 thread->viewport[1] = 0;
5019 thread->viewport[2] = dpsoftrast.fb_width;
5020 thread->viewport[3] = dpsoftrast.fb_height;
5021 thread->scissor[0] = 0;
5022 thread->scissor[1] = 0;
5023 thread->scissor[2] = dpsoftrast.fb_width;
5024 thread->scissor[3] = dpsoftrast.fb_height;
5025 thread->depthrange[0] = 0;
5026 thread->depthrange[1] = 1;
5027 thread->polygonoffset[0] = 0;
5028 thread->polygonoffset[1] = 0;
// interlaced: the framebuffer height is cut into 2*numthreads slices and
// thread i gets slice i and slice numthreads+i
5030 if (dpsoftrast.interlace)
5032 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5033 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5034 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5035 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// not interlaced: a single slice per thread; both ranges are set to the same rows
5039 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5040 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5043 thread->numspans = 0;
5044 thread->numtriangles = 0;
5045 thread->commandoffset = 0;
5046 thread->waiting = false;
5047 thread->starving = false;
// synchronisation objects used by DPSOFTRAST_Draw_Thread and DPSOFTRAST_Draw_FlushThreads
5049 thread->waitcond = SDL_CreateCond();
5050 thread->drawcond = SDL_CreateCond();
5051 thread->drawmutex = SDL_CreateMutex();
5054 thread->validate = -1;
5055 DPSOFTRAST_Validate(thread, -1);
// start the worker last, after its state has been filled in
5057 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stops the rasterizer threads, releases texture and thread storage, and
// zeroes the global dpsoftrast state.
5062 void DPSOFTRAST_Shutdown(void)
5066 if (dpsoftrast.numthreads > 0)
5068 DPSOFTRAST_State_Thread *thread;
5069 for (i = 0; i < dpsoftrast.numthreads; i++)
5071 thread = &dpsoftrast.threads[i];
// the wake-up is sent while holding drawmutex, the mutex the worker sleeps under;
// presumably the thread is also told to exit here (its loop runs while index >= 0) - confirm
5072 SDL_LockMutex(thread->drawmutex);
5074 SDL_CondSignal(thread->drawcond);
5075 SDL_UnlockMutex(thread->drawmutex);
// wait for the worker to end before destroying the objects it uses
5076 SDL_WaitThread(thread->thread, NULL);
5077 SDL_DestroyCond(thread->waitcond);
5078 SDL_DestroyCond(thread->drawcond);
5079 SDL_DestroyMutex(thread->drawmutex);
// NOTE(review): dpsoftrast.texture is dereferenced here before the NULL check
// below. DPSOFTRAST_Init zeroes the state and then sets texture_end to 1, so
// if the texture array can still be NULL at shutdown this loop reads through
// a NULL pointer - confirm.
5083 for (i = 0;i < dpsoftrast.texture_end;i++)
5084 if (dpsoftrast.texture[i].bytes)
5085 MM_FREE(dpsoftrast.texture[i].bytes);
// the texture array is released with plain free(), unlike the MM_FREE used for
// texture bytes and threads
5086 if (dpsoftrast.texture)
5087 free(dpsoftrast.texture);
5088 if (dpsoftrast.threads)
5089 MM_FREE(dpsoftrast.threads);
5090 memset(&dpsoftrast, 0, sizeof(dpsoftrast));