3 #define _USE_MATH_DEFINES
6 #include "dpsoftrast.h"
13 typedef qboolean bool;
// Alignment, memory-barrier and atomic-counter macros, defined once per toolchain.
// NOTE(review): the #if/#else lines that choose between the variants below are
// not visible in this excerpt; the grouping is inferred from the syntax each
// variant uses.
17 #define ATOMIC_SIZE 32
// __attribute__ variant: ALIGN requests 16-byte and ATOMIC 32-byte alignment.
21 #define ALIGN(var) var __attribute__((__aligned__(16)))
22 #define ATOMIC(var) var __attribute__((__aligned__(32)))
24 #define MEMORY_BARRIER (_mm_sfence())
25 //(__sync_synchronize())
// Counters are updated through the __sync builtins; ATOMIC_ADD discards the result.
26 #define ATOMIC_COUNTER volatile int
27 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC variant: same alignments via __declspec, counters via the Interlocked API.
31 #elif defined(_MSC_VER)
32 #define ALIGN(var) __declspec(align(16)) var
33 #define ATOMIC(var) __declspec(align(32)) var
35 #define MEMORY_BARRIER (_mm_sfence())
37 #define ATOMIC_COUNTER volatile LONG
38 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// Unlike the other variants, this ATOMIC_ADD is not cast to void.
40 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Fallback: ALIGN and ATOMIC expand to the bare declaration (no alignment request).
49 #define ALIGN(var) var
50 #define ATOMIC(var) var
55 #include <SDL_thread.h>
// Plain, non-atomic counter operations and an empty barrier; the SDL handle
// types are stubbed out as void.
// NOTE(review): presumably the build without thread support - confirm against
// the conditionals that are not shown.
57 #define MEMORY_BARRIER ((void)0)
58 #define ATOMIC_COUNTER int
59 #define ATOMIC_INCREMENT(counter) (++(counter))
60 #define ATOMIC_DECREMENT(counter) (--(counter))
61 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62 typedef void SDL_Thread;
63 typedef void SDL_cond;
64 typedef void SDL_mutex;
68 #include <emmintrin.h>
// Allocation helpers. The first set allocates through _mm_malloc with
// ATOMIC_SIZE alignment and frees through _mm_free.
70 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-like helper: allocates nmemb*size bytes and zero-fills them when the
// allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow in the visible lines.
72 static void *MM_CALLOC(size_t nmemb, size_t size)
74 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
75 if (ptr != NULL) memset(ptr, 0, nmemb*size);
79 #define MM_FREE _mm_free
// Alternative definitions that map straight onto malloc/calloc.
// NOTE(review): the conditional selecting between the two sets is not visible.
81 #define MM_MALLOC(size) malloc(size)
82 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture-coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the entry count and is
// used to size attribute arrays elsewhere in this file.
86 typedef enum DPSOFTRAST_ARRAY_e
88 DPSOFTRAST_ARRAY_POSITION,
89 DPSOFTRAST_ARRAY_COLOR,
90 DPSOFTRAST_ARRAY_TEXCOORD0,
91 DPSOFTRAST_ARRAY_TEXCOORD1,
92 DPSOFTRAST_ARRAY_TEXCOORD2,
93 DPSOFTRAST_ARRAY_TEXCOORD3,
94 DPSOFTRAST_ARRAY_TEXCOORD4,
95 DPSOFTRAST_ARRAY_TEXCOORD5,
96 DPSOFTRAST_ARRAY_TEXCOORD6,
97 DPSOFTRAST_ARRAY_TEXCOORD7,
98 DPSOFTRAST_ARRAY_TOTAL
// Per-texture record; instances live in the dpsoftrast.texture array.
// NOTE(review): several members (flags, width, height, depth, sides, mipmaps,
// size) are assigned elsewhere in this file but are not visible in this excerpt.
102 typedef struct DPSOFTRAST_Texture_s
109 DPSOFTRAST_TEXTURE_FILTER filter;
112 ATOMIC_COUNTER binds;
// pixel storage; a NULL value marks the slot as unused (DPSOFTRAST_Texture_GetByIndex rejects it)
113 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
114 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records share ALIGN's alignment; COMMAND_SIZE aliases ALIGN_SIZE,
// which is defined outside the visible lines.
118 #define COMMAND_SIZE ALIGN_SIZE
119 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header: every queued command starts with an opcode byte
// followed by its size.
121 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
123 unsigned char opcode;
124 unsigned short commandsize;
// Opcode 0 is reserved for the Reset marker written by DPSOFTRAST_AllocateCommand.
128 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and a struct DPSOFTRAST_Command_<name> that begins
// with the same opcode/commandsize header.
// NOTE(review): `fields` is presumably spliced into the struct on a macro line
// that is not visible in this excerpt.
130 #define DEFCOMMAND(opcodeval, name, fields) \
131 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
132 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
134 unsigned char opcode; \
135 unsigned short commandsize; \
137 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB) and the 16 KiB command size limit.
139 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
140 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer shared through dpsoftrast.commandpool.
// NOTE(review): the freecommand / usedcommands members used by the allocator
// are declared on lines not visible here.
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
146 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
148 DPSOFTRAST_State_Command_Pool);
// Per-triangle data consumed by the span shaders.
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
152 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// For each attribute array, three 4-float rows. The DPSOFTRAST_CALCATTRIB
// macros read row 0 as the per-x slope, row 1 as the per-y slope and row 2 as
// the base value.
154 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
156 DPSOFTRAST_State_Triangle);
// Attribute setup for a span, in two flavours:
//   DPSOFTRAST_CALCATTRIB   - SSE form; slope and data are __m128 lvalues.
//   DPSOFTRAST_CALCATTRIB4F - scalar form; slope and data are 4-element arrays.
// Both load slope from attribs[arrayindex][0] and compute
//   data = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// NOTE(review): the scalar form uses (span->x) rather than ((span)->x), so it
// only works when `span` is a plain identifier.
158 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
159 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
160 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
161 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
162 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
164 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
165 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
166 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
167 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
168 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
169 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
170 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
171 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
172 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
175 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; the member comments
// below give the meaning of each field.
177 typedef ALIGN(struct DPSOFTRAST_State_Span_s
179 int triangle; // triangle this span was generated by
180 int x; // framebuffer x coord
181 int y; // framebuffer y coord
182 int length; // pixel count
183 int startx; // usable range (according to pixelmask)
184 int endx; // usable range (according to pixelmask)
185 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
189 #define DPSOFTRAST_DRAW_MAXSPANS 1024
190 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate; each bit names one group of derived
// state that DPSOFTRAST_Validate recomputes. VALIDATE_DRAW is the union of all three.
192 #define DPSOFTRAST_VALIDATE_FB 1
193 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
194 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
195 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc derives from the
// (sfactor, dfactor, blendsubtract) state. DPSOFTRAST_BLENDMODE_TOTAL is the
// number of modes.
197 typedef enum DPSOFTRAST_BLENDMODE_e
199 DPSOFTRAST_BLENDMODE_OPAQUE,
200 DPSOFTRAST_BLENDMODE_ALPHA,
201 DPSOFTRAST_BLENDMODE_ADDALPHA,
202 DPSOFTRAST_BLENDMODE_ADD,
203 DPSOFTRAST_BLENDMODE_INVMOD,
204 DPSOFTRAST_BLENDMODE_MUL,
205 DPSOFTRAST_BLENDMODE_MUL2,
206 DPSOFTRAST_BLENDMODE_SUBALPHA,
207 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
208 DPSOFTRAST_BLENDMODE_TOTAL
210 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread: the render state that the
// DPSOFTRAST_Interpret_* functions write, values derived from it, the thread's
// position in the command pool, and its span/triangle work buffers.
// NOTE(review): many members referenced elsewhere in this file (viewport,
// scissor, validate, miny1/maxy1, drawcond, waitcond, ...) are declared on
// lines not visible in this excerpt.
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
231 float polygonoffset[2];
234 int shader_permutation;
236 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
238 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
241 // DPSOFTRAST_VALIDATE_ flags
244 // derived values (DPSOFTRAST_VALIDATE_FB)
246 int fb_clearscissor[4];
247 ALIGN(float fb_viewportcenter[4]);
248 ALIGN(float fb_viewportscale[4]);
250 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
253 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against dpsoftrast.drawcommand and
// commandpool.freecommand by DPSOFTRAST_Draw_FreeCommandPool
262 ATOMIC(volatile int commandoffset);
// waiting is set by DPSOFTRAST_Draw_FreeCommandPool around its SDL_CondWait;
// starving is read there to decide whether to signal drawcond
264 volatile bool waiting;
265 volatile bool starving;
268 SDL_mutex *drawmutex;
272 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
273 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
275 DPSOFTRAST_State_Thread);
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the worker threads and the shared command pool.
// NOTE(review): further members used elsewhere in this file (fb_width,
// fb_height, viewport, color, texture_max, texture_end, numthreads, ...) are
// declared on lines not visible in this excerpt.
277 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
281 unsigned int *fb_depthpixels;
282 unsigned int *fb_colorpixels[4];
285 ALIGN(float fb_viewportcenter[4]);
286 ALIGN(float fb_viewportscale[4]);
289 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
290 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
292 const float *pointer_vertex3f;
293 const float *pointer_color4f;
294 const unsigned char *pointer_color4ub;
295 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
298 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
299 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
300 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
305 float *screencoord4f;
311 int shader_permutation;
// texture table: DPSOFTRAST_Texture_New starts its search for an unused slot
// at texture_firstfree
315 int texture_firstfree;
316 DPSOFTRAST_Texture *texture;
// set to a static message by API functions that reject their arguments
321 const char *errorstring;
325 DPSOFTRAST_State_Thread *threads;
// set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
327 ATOMIC(volatile int drawcommand);
329 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used throughout this file
333 DPSOFTRAST_State dpsoftrast;
// Depth values are stored as integers: DEPTH32_FROM_DEPTH32F maps d = 0 to
// DPSOFTRAST_DEPTHSCALE (2^30) and d = 1 to 0.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (expected in 0..1) into one 32-bit pixel laid out
// as A<<24 | R<<16 | G<<8 | B, rounding each channel to the nearest of 0..255.
// Every argument is parenthesized so expression arguments such as (x + y)
// scale correctly. Shifts are done on unsigned values so that alpha == 1.0
// (255 << 24) does not overflow a signed int; the final cast keeps the
// original int result type (same bit pattern on two's-complement targets).
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Argument parenthesized so that expressions such as (a + b) are negated as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derives the viewport transform vectors from a viewport rectangle.
// viewport is read as {x, y, width, height}. Each output is a 4-float vector
// whose element 1 carries the x term, element 2 the y term (flipped against
// dpsoftrast.fb_height, hence the negative scale), element 3 the constant 0.5,
// and element 0 a constant (0 for the center, 1 for the scale).
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
343 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
344 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345 fb_viewportcenter[3] = 0.5f;
346 fb_viewportcenter[0] = 0.0f;
347 fb_viewportscale[1] = 0.5f * viewport[2];
348 fb_viewportscale[2] = -0.5f * viewport[3];
349 fb_viewportscale[3] = 0.5f;
350 fb_viewportscale[0] = 1.0f;
// Recomputes the thread's framebuffer-derived state: the clear rectangle
// (fb_clearscissor, stored as x, y, width, height) and the viewport transform.
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
355 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356 // and viewport projection values
// scissor rectangle with its y range flipped against the framebuffer height
359 x1 = thread->scissor[0];
360 x2 = thread->scissor[0] + thread->scissor[2];
361 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
363 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the upper bounds to the framebuffer
// NOTE(review): the matching lower-bound clamps are presumably on lines not
// visible in this excerpt.
365 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
367 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
368 thread->fb_clearscissor[0] = x1;
369 thread->fb_clearscissor[1] = y1;
370 thread->fb_clearscissor[2] = x2 - x1;
371 thread->fb_clearscissor[3] = y2 - y1;
373 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derives the effective depth comparison: the configured depthfunc while the
// depth test is enabled, GL_ALWAYS otherwise.
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
378 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (sfactor, dfactor) pair in thread->blendfunc, together with the
// blendsubtract flag, onto one DPSOFTRAST_BLENDMODE_* value stored in
// thread->fb_blendmode. The two factors are packed into a single switch key as
// (sfactor << 16) | dfactor; unrecognised pairs fall back to OPAQUE.
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only one factor pair is recognised
383 if (thread->blendsubtract)
385 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
387 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
// NOTE(review): SUBALPHA is keyed on GL_SRC_COLOR here, while ADDALPHA below
// is keyed on GL_SRC_ALPHA - confirm the source factor is intended.
389 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second switch
// NOTE(review): presumably the else branch for non-subtractive blending; the
// else line itself is not visible in this excerpt.
395 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
397 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
402 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
// NOTE(review): the same pair that selects SUBALPHA in the subtractive switch
// also appears here - confirm this entry is intended.
406 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
407 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one bit of f is dirty in
// thread->validate; the expression always evaluates to 0.
// NOTE(review): `thread` is not parenthesized in the expansion, so it must be
// a plain identifier.
412 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes every group of derived state that is both requested in mask and
// marked dirty in thread->validate, clearing each dirty bit before the
// corresponding Recalc function runs.
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// restrict the request to bits that are actually dirty
416 mask &= thread->validate;
419 if (mask & DPSOFTRAST_VALIDATE_FB)
421 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422 DPSOFTRAST_RecalcFB(thread);
424 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427 DPSOFTRAST_RecalcDepthFunc(thread);
429 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when the index is
// in [1, texture_end) and the slot has pixel storage (bytes != NULL).
// NOTE(review): the fallback return for a rejected index is on a line not
// visible here; callers in this file test the result with `!texture`.
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
438 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every texbound pointer
// (global and per-thread) from the old array onto the reallocated one.
443 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be translated afterwards
445 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446 DPSOFTRAST_State_Thread *thread;
450 // expand texture array as needed
451 if (dpsoftrast.texture_max < 1024)
452 dpsoftrast.texture_max = 1024;
// NOTE(review): the line between these two statements is not visible in this
// excerpt; presumably an `else`, making the doubling the alternative to the
// 1024 floor - confirm against the full file.
454 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check in the
// visible lines; on failure the old array pointer would be lost.
455 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// translate each bound pointer by its element offset within the old array
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (dpsoftrast.texbound[i])
458 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
459 for (j = 0; j < dpsoftrast.numthreads; j++)
461 thread = &dpsoftrast.threads[j];
462 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463 if (thread->texbound[i])
464 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the requested flags and dimensions, claims a
// slot in the texture table, records the mip chain layout and allocates
// zero-filled pixel storage. On a rejected request dpsoftrast.errorstring is
// set to a static message.
// NOTE(review): the return statements are on lines not visible in this excerpt.
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
477 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the MAXSIZE check below, so
// very large inputs could overflow int here.
480 if (width*height*depth < 1)
482 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
485 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions (the switch statement line itself is not visible)
492 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
496 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
499 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second, identical message is not
// visible in this excerpt.
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
507 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected for any format
514 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
516 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the condition and message of the check directly above
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
534 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
539 // find first empty slot in texture array
540 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541 if (!dpsoftrast.texture[texnum].bytes)
543 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen slot lies beyond its capacity
544 if (dpsoftrast.texture_max <= texnum)
545 DPSOFTRAST_Texture_Grow();
546 if (dpsoftrast.texture_end <= texnum)
547 dpsoftrast.texture_end = texnum + 1;
548 texture = &dpsoftrast.texture[texnum];
549 memset(texture, 0, sizeof(*texture));
550 texture->flags = flags;
551 texture->width = width;
552 texture->height = height;
553 texture->depth = depth;
554 texture->sides = sides;
// mip chain layout: s is the byte size of one level at 4 bytes per texel, and
// size is stored as that level's byte offset.
// NOTE(review): the loop header and the updates of w/h/d/size/mipmaps are on
// lines not visible in this excerpt.
566 s = w * h * d * sides * 4;
567 texture->mipmap[mipmaps][0] = size;
568 texture->mipmap[mipmaps][1] = s;
569 texture->mipmap[mipmaps][2] = w;
570 texture->mipmap[mipmaps][3] = h;
571 texture->mipmap[mipmaps][4] = d;
// true once the level is a single texel, or when mipmaps were not requested
574 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580 texture->mipmaps = mipmaps;
581 texture->size = size;
583 // allocate the pixels now
// NOTE(review): no NULL check on this allocation is visible in this excerpt
584 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free/used bookkeeping of the texture table. Does nothing for an index that
// DPSOFTRAST_Texture_GetByIndex rejects.
588 void DPSOFTRAST_Texture_Free(int index)
590 DPSOFTRAST_Texture *texture;
591 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
595 MM_FREE(texture->bytes);
596 texture->bytes = NULL;
// zeroing the whole record also leaves bytes NULL, which marks the slot unused
597 memset(texture, 0, sizeof(*texture));
598 // adjust the free range and used range
599 if (dpsoftrast.texture_firstfree > index)
600 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing unused slots
601 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture by averaging texels of the
// next larger level. Texels are 4 bytes and every channel is averaged
// independently with round-to-nearest.
// NOTE(review): the assignments of layer0/layer1/row0/row1 and the test that
// separates the 3D from the 2D paths are on lines not visible in this excerpt.
604 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
606 int i, x, y, z, w, layer0, layer1, row0, row1;
607 unsigned char *o, *i0, *i1, *i2, *i3;
608 DPSOFTRAST_Texture *texture;
609 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
610 if (texture->mipmaps <= 1)
612 for (i = 1;i < texture->mipmaps;i++)
614 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
618 if (layer1 >= texture->mipmap[i-1][4])
619 layer1 = texture->mipmap[i-1][4]-1;
620 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
624 if (row1 >= texture->mipmap[i-1][3])
625 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the four parent rows selected by
// (layer0|layer1) x (row0|row1) in level i-1
626 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
627 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
628 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
629 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
630 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
631 w = texture->mipmap[i][2];
634 if (texture->mipmap[i-1][2] > 1)
636 // average 3D texture
// eight source texels per output texel: two adjacent texels from each of the four rows
637 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
639 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
640 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
641 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
642 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
647 // average 3D mipmap with parent width == 1
// four source texels per output texel: one from each of the four rows
648 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
650 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
651 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
652 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
653 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
659 if (texture->mipmap[i-1][2] > 1)
661 // average 2D texture (common case)
// four source texels per output texel: a 2x2 block from rows row0 and row1
662 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
664 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
665 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
666 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
667 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
672 // 2D texture with parent width == 1
// two source texels: one from each of the two rows
673 o[0] = (i0[0] + i1[0] + 1) >> 1;
674 o[1] = (i0[1] + i1[1] + 1) >> 1;
675 o[2] = (i0[2] + i1[2] + 1) >> 1;
676 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mip chain.
// NOTE(review): the visible lines address level 0 only (mipmap[0]) and never
// reference `mip`; no bounds check of the block against the texture size is
// visible either.
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
685 DPSOFTRAST_Texture *texture;
687 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
690 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
// NOTE(review): the decrement of blockheight is on a line not visible here
691 while (blockheight > 0)
693 memcpy(dst, pixels, blockwidth * 4);
694 pixels += blockwidth * 4;
// advance by one full texture row
695 dst += texture->mipmap[0][2] * 4;
698 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 with `pixels`, which must supply mipmap[0][1]
// bytes, then rebuilds the remaining mip levels.
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
702 DPSOFTRAST_Texture *texture;
703 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
706 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of the given mip level, or 0 when the index does not name
// a live texture.
// NOTE(review): mip is used as an array index without a range check in the
// visible lines.
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][2];
// Returns the height of the given mip level, or 0 when the index does not name
// a live texture.
// NOTE(review): mip is used as an array index without a range check in the
// visible lines.
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][3];
// Returns the depth of the given mip level, or 0 when the index does not name
// a live texture.
// NOTE(review): mip is used as an array index without a range check in the
// visible lines.
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of the given mip level inside the
// texture's pixel storage, or a null pointer when the index does not name a
// live texture.
// NOTE(review): mip is used as an array index without a range check in the
// visible lines.
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
729 DPSOFTRAST_Texture *texture;
730 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
733 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected, with errorstring set, for
// textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// NOTE(review): the early return after the error is presumably on a line not
// visible in this excerpt.
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
741 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
746 texture->filter = filter;
// Installs the framebuffer size, the depth buffer and up to four color
// buffers. The stores happen only when at least one value differs from the
// current state.
// NOTE(review): any further work done inside the changed branch (e.g. flushing
// pending commands) is on lines not visible in this excerpt.
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
751 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
755 dpsoftrast.fb_width = width;
756 dpsoftrast.fb_height = height;
757 dpsoftrast.fb_depthpixels = depthpixels;
758 dpsoftrast.fb_colorpixels[0] = colorpixels0;
759 dpsoftrast.fb_colorpixels[1] = colorpixels1;
760 dpsoftrast.fb_colorpixels[2] = colorpixels2;
761 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; the definition lies outside the visible lines
764 static void DPSOFTRAST_Draw_FlushThreads(void);
// Copies the command pool's current write position (commandpool.freecommand)
// into dpsoftrast.drawcommand, the value each thread's commandoffset is
// compared against.
// NOTE(review): a line preceding the store is not visible in this excerpt.
766 static void DPSOFTRAST_Draw_SyncCommands(void)
769 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes of the command pool reusable by waiting for the
// thread that is furthest behind the write position.
// NOTE(review): several lines (early return, loop framing, waitindex
// bookkeeping, loop exit) are not visible in this excerpt; the comments below
// describe only the visible statements.
772 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
775 DPSOFTRAST_State_Thread *thread;
777 int freecommand = dpsoftrast.commandpool.freecommand;
778 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room already?
779 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
781 DPSOFTRAST_Draw_SyncCommands();
// for every thread, measure how many bytes lie between its read position and
// the write position (wrapping around the pool) and keep the largest distance
787 for (i = 0; i < dpsoftrast.numthreads; i++)
789 thread = &dpsoftrast.threads[i];
790 commandoffset = freecommand - thread->commandoffset;
791 if (commandoffset < 0)
792 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
793 if (commandoffset > usedcommands)
796 usedcommands = commandoffset;
// enough room now, or no thread to wait for
799 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread until it signals waitcond, waking it first
// through drawcond if it reports itself as starving
801 thread = &dpsoftrast.threads[waitindex];
802 SDL_LockMutex(thread->drawmutex);
803 if (thread->commandoffset != dpsoftrast.drawcommand)
805 thread->waiting = true;
806 if (thread->starving) SDL_CondSignal(thread->drawcond);
807 SDL_CondWait(thread->waitcond, thread->drawmutex);
808 thread->waiting = false;
810 SDL_UnlockMutex(thread->drawmutex);
812 dpsoftrast.commandpool.usedcommands = usedcommands;
// NOTE(review): presumably the alternative path for builds without threads -
// the conditional around this call is not visible.
814 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of COMMAND_SIZE
// (the mask arithmetic assumes COMMAND_SIZE is a power of two).
// DPSOFTRAST_ALLOCATECOMMAND(name) reserves an aligned record of type
// DPSOFTRAST_Command_<name> with its opcode already filled in.
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and writes the command header
// (opcode, commandsize) at the reserved position.
// NOTE(review): the lines that reset or advance freecommand and return the
// command pointer are not visible in this excerpt.
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
825 DPSOFTRAST_Command *command;
826 int freecommand = dpsoftrast.commandpool.freecommand;
827 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra always includes room for one bare command header
828 int extra = sizeof(DPSOFTRAST_Command);
// the command does not fit before the end of the pool, so the unusable tail
// is counted as extra
829 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough unused space: ask for space to be freed, then reload the positions
831 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
834 freecommand = dpsoftrast.commandpool.freecommand;
835 usedcommands = dpsoftrast.commandpool.usedcommands;
// mark the tail with a Reset opcode and account for the skipped bytes
837 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
839 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840 command->opcode = DPSOFTRAST_OPCODE_Reset;
841 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
844 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845 command->opcode = opcode;
846 command->commandsize = size;
848 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
850 dpsoftrast.commandpool.freecommand = freecommand;
851 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back `size` bytes of the most recent allocation by adjusting the
// pool's freecommand and usedcommands counters.
// NOTE(review): the subtraction from freecommand and the condition guarding
// the wrap-around addition below are on lines not visible in this excerpt.
855 static void DPSOFTRAST_UndoCommand(int size)
857 int freecommand = dpsoftrast.commandpool.freecommand;
858 int usedcommands = dpsoftrast.commandpool.usedcommands;
861 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862 usedcommands -= size;
863 dpsoftrast.commandpool.freecommand = freecommand;
864 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): payload is the viewport rectangle.
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle and marks the framebuffer-derived state dirty.
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
870 thread->viewport[0] = command->x;
871 thread->viewport[1] = command->y;
872 thread->viewport[2] = command->width;
873 thread->viewport[3] = command->height;
874 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command and also mirrors the rectangle into the global
// state, recomputing the global viewport transform immediately.
// NOTE(review): the stores to command->x / command->y are on lines not visible here.
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
878 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
881 command->width = width;
882 command->height = height;
884 dpsoftrast.viewport[0] = x;
885 dpsoftrast.viewport[1] = y;
886 dpsoftrast.viewport[2] = width;
887 dpsoftrast.viewport[3] = height;
888 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): payload is the clear color as four floats.
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills the clear rectangle of every attached color buffer with
// the packed color, restricted to the rows this thread owns.
// NOTE(review): several statements (remaining clamps, loop bodies, the pixel
// store) are on lines not visible in this excerpt.
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
894 int i, x1, y1, x2, y2, w, h, x, y;
// the thread's row ranges; the loop below visits them as two bands
895 int miny1 = thread->miny1;
896 int maxy1 = thread->maxy1;
897 int miny2 = thread->miny2;
898 int maxy2 = thread->maxy2;
// make sure fb_clearscissor is up to date before reading it
902 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903 x1 = thread->fb_clearscissor[0];
904 y1 = thread->fb_clearscissor[1];
905 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
906 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
907 if (y1 < miny1) y1 = miny1;
908 if (y2 > maxy2) y2 = maxy2;
913 // FIXME: honor fb_colormask?
914 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915 for (i = 0;i < 4;i++)
// unattached color buffers are skipped
917 if (!dpsoftrast.fb_colorpixels[i])
// first band ends at min(y2, maxy1); after it the loop jumps y forward to
// miny2 and continues up to min(y2, maxy2)
919 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
922 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command.
// NOTE(review): the payload stores are on lines not visible here.
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
930 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): payload is the depth value to clear to.
937 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills the clear rectangle of the depth buffer with the
// converted depth value, restricted to the rows this thread owns.
// NOTE(review): several statements (remaining clamps, loop bodies, the pixel
// store) are on lines not visible in this excerpt.
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
940 int x1, y1, x2, y2, w, h, x, y;
// the thread's row ranges; the loop below visits them as two bands
941 int miny1 = thread->miny1;
942 int maxy1 = thread->maxy1;
943 int miny2 = thread->miny2;
944 int maxy2 = thread->maxy2;
// make sure fb_clearscissor is up to date before reading it
948 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949 x1 = thread->fb_clearscissor[0];
950 y1 = thread->fb_clearscissor[1];
951 x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
952 y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
953 if (y1 < miny1) y1 = miny1;
954 if (y2 > maxy2) y2 = maxy2;
959 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first band ends at min(y2, maxy1); after it the loop jumps y forward to
// miny2 and continues up to min(y2, maxy2)
960 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
963 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command.
// NOTE(review): the payload store is on a line not visible here.
968 void DPSOFTRAST_ClearDepth(float d)
970 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): payload is one enable flag per channel.
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: normalises each flag to 0/1 and builds fb_colormask, a 32-bit
// mask with 0xFF in the byte of every enabled channel (R at bits 16-23, G at
// 8-15, B at 0-7, A at 24-31). Negating a 0/1 flag yields 0 or all ones.
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
977 thread->colormask[0] = command->r != 0;
978 thread->colormask[1] = command->g != 0;
979 thread->colormask[2] = command->b != 0;
980 thread->colormask[3] = command->a != 0;
981 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command.
// NOTE(review): the payload stores are on lines not visible here.
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
985 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): payload is the enable flag.
992 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function dirty.
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
995 thread->depthtest = command->enable;
996 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues the command.
998 void DPSOFTRAST_DepthTest(int enable)
1000 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001 command->enable = enable;
// ScissorTest command (opcode 6): payload is the enable flag.
1004 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived state dirty.
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1007 thread->scissortest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
1010 void DPSOFTRAST_ScissorTest(int enable)
1012 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013 command->enable = enable;
// Scissor command (opcode 7): payload is the scissor rectangle as floats.
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle and marks the framebuffer-derived state dirty.
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1019 thread->scissor[0] = command->x;
1020 thread->scissor[1] = command->y;
1021 thread->scissor[2] = command->width;
1022 thread->scissor[3] = command->height;
1023 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues the command.
// NOTE(review): the stores to command->x / command->y are on lines not visible here.
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1027 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1030 command->width = width;
1031 command->height = height;
// BlendFunc command (opcode 8): payload is the source and destination factor.
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores both factors and marks the derived blend mode dirty.
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1037 thread->blendfunc[0] = command->sfactor;
1038 thread->blendfunc[1] = command->dfactor;
1039 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1043 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044 command->sfactor = sfactor;
1045 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): payload is the enable flag.
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
// Thread side: stores the flag and marks the derived blend mode dirty.
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1051 thread->blendsubtract = command->enable;
1052 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues the command.
1054 void DPSOFTRAST_BlendSubtract(int enable)
1056 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057 command->enable = enable;
// DepthMask command (opcode 10): payload is the enable flag.
1060 DEFCOMMAND(10, DepthMask, int enable;)
// Thread side: stores the flag in thread->depthmask.
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1063 thread->depthmask = command->enable;
// API side: queues the command.
1065 void DPSOFTRAST_DepthMask(int enable)
1067 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068 command->enable = enable;
// DepthFunc command (opcode 11): payload is the comparison function.
1071 DEFCOMMAND(11, DepthFunc, int func;)
// Thread side: stores the function in thread->depthfunc.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest, this does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC, although DPSOFTRAST_RecalcDepthFunc derives
// fb_depthfunc from depthfunc - confirm whether fb_depthfunc can go stale.
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1074 thread->depthfunc = command->func;
// API side: queues the command.
1076 void DPSOFTRAST_DepthFunc(int func)
1078 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079 command->func = func;
// DepthRange command (opcode 12): payload is the near and far value.
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Thread side: stores the pair in thread->depthrange as {near, far}.
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1085 thread->depthrange[0] = command->nearval;
1086 thread->depthrange[1] = command->farval;
// API side: queues the command.
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1090 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091 command->nearval = nearval;
1092 command->farval = farval;
// DEFCOMMAND arguments: number 13, name PolygonOffset, fields
// `float alongnormal` and `float intoview`.
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: polygonoffset[0] receives alongnormal and
// polygonoffset[1] receives intoview.
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1098 thread->polygonoffset[0] = command->alongnormal;
1099 thread->polygonoffset[1] = command->intoview;
// Allocates a PolygonOffset command carrying both offset values.
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1103 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104 command->alongnormal = alongnormal;
1105 command->intoview = intoview;
// DEFCOMMAND arguments: number 14, name CullFace, field `int mode`.
1108 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: copies the mode into thread->cullface.
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1111 thread->cullface = command->mode;
// Allocates a CullFace command carrying the cull mode.
1113 void DPSOFTRAST_CullFace(int mode)
1115 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116 command->mode = mode;
// DEFCOMMAND arguments: number 15, name AlphaTest, field `int enable`.
1119 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: copies the enable flag into thread->alphatest.
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1122 thread->alphatest = command->enable;
// Allocates an AlphaTest command carrying the enable flag.
1124 void DPSOFTRAST_AlphaTest(int enable)
1126 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127 command->enable = enable;
// DEFCOMMAND arguments: number 16, name AlphaFunc, fields `int func` and
// `float ref`.
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores the comparison function in
// thread->alphafunc and the reference value in thread->alphavalue.
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1133 thread->alphafunc = command->func;
1134 thread->alphavalue = command->ref;
// Allocates an AlphaFunc command and stores the comparison function in it.
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1138 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139 command->func = func;
// Sets the four components of dpsoftrast.color directly (no command is
// allocated here); DPSOFTRAST_Array_Load uses this value to fill the color
// array when no color pointer is set.
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1145 dpsoftrast.color[0] = r;
1146 dpsoftrast.color[1] = g;
1147 dpsoftrast.color[2] = b;
1148 dpsoftrast.color[3] = a;
// Reads a rectangular block from the color framebuffer
// (dpsoftrast.fb_colorpixels[0]) into outpixels, 4 bytes per pixel.
// The requested rectangle is clamped to the framebuffer, and rows are read
// vertically flipped: row y comes from framebuffer row (fb_height - 1 - y).
// NOTE(review): output rows are addressed relative to the clamped by1, so a
// request that is clipped at its origin presumably lands at the start of
// outpixels instead of at its unclipped offset - confirm callers expect this.
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row
1153 int outstride = blockwidth * 4;
1154 int instride = dpsoftrast.fb_width * 4;
1157 int bx2 = blockx + blockwidth;
1158 int by2 = blocky + blockheight;
1163 unsigned char *inpixels;
// clamp the rectangle to the framebuffer bounds
1167 if (bx1 < 0) bx1 = 0;
1168 if (by1 < 0) by1 = 0;
1169 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1173 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a separate per-pixel loop
1174 if (dpsoftrast.bigendian)
1176 for (y = by1;y < by2;y++)
1178 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179 o = (unsigned char *)outpixels + (y - by1) * outstride;
1180 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing as above
1193 for (y = by1;y < by2;y++)
1195 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer at (sx, sy)
// into mip level `mip` of texture `index` at (tx, ty).
// Returns without copying when the texture handle or the mip level is
// invalid. Both rectangles are clamped to their surface, the copy size is
// limited to the smaller of the two, rows are copied with memcpy at 4 bytes
// per pixel, and mipmaps are recalculated when the texture has more than one
// level.
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1206 int tx2 = tx + width;
1207 int ty2 = ty + height;
1210 int sx2 = sx + width;
1211 int sy2 = sy + height;
1221 unsigned int *spixels;
1222 unsigned int *tpixels;
1223 DPSOFTRAST_Texture *texture;
1224 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1228 spixels = dpsoftrast.fb_colorpixels[0];
1229 swidth = dpsoftrast.fb_width;
1230 sheight = dpsoftrast.fb_height;
// target surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the level width and [3] as the level height
1231 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232 twidth = texture->mipmap[mip][2];
1233 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1234 if (tx1 < 0) tx1 = 0;
1235 if (ty1 < 0) ty1 = 0;
1236 if (tx2 > twidth) tx2 = twidth;
1237 if (ty2 > theight) ty2 = theight;
1238 if (sx1 < 0) sx1 = 0;
1239 if (sy1 < 0) sy1 = 0;
1240 if (sx2 > swidth) sx2 = swidth;
1241 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1246 if (tw > sw) tw = sw;
1247 if (th > sh) th = sh;
1248 if (tw < 1 || th < 1)
// row-by-row copy, tw pixels of 4 bytes each
1250 for (y = 0;y < th;y++)
1251 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1252 if (texture->mipmaps > 1)
1253 DPSOFTRAST_Texture_CalculateMipmaps(index);
// DEFCOMMAND arguments: number 17, name SetTexture, fields `int unitnum` and
// `DPSOFTRAST_Texture *texture`.
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to a thread: if the unit already has a texture
// bound, that texture's `binds` counter is atomically decremented before the
// new texture pointer is stored in thread->texbound.
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1259 if (thread->texbound[command->unitnum])
1260 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261 thread->texbound[command->unitnum] = command->texture;
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1265 DPSOFTRAST_Command_SetTexture *command;
1266 DPSOFTRAST_Texture *texture;
1267 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1269 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272 texture = DPSOFTRAST_Texture_GetByIndex(index);
1273 if (index && !texture)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1279 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280 command->unitnum = unitnum;
1281 command->texture = texture;
1283 dpsoftrast.texbound[unitnum] = texture;
1284 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its stride in the global state.
// DPSOFTRAST_Array_Load multiplies this stride into a byte pointer, so it is
// a byte stride.
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1289 dpsoftrast.pointer_vertex3f = vertex3f;
1290 dpsoftrast.stride_vertex = stride;
// Selects a float color array: stores the pointer and stride and clears the
// unsigned-byte color pointer so only one color source is active.
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1294 dpsoftrast.pointer_color4f = color4f;
1295 dpsoftrast.pointer_color4ub = NULL;
1296 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array: stores the pointer and stride and
// clears the float color pointer so only one color source is active.
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1300 dpsoftrast.pointer_color4f = NULL;
1301 dpsoftrast.pointer_color4ub = color4ub;
1302 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: pointer, number of
// float components per vertex, and stride.
// NOTE(review): unitnum is used as an array index without a range check here
// - confirm callers guarantee a valid unit.
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1306 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308 dpsoftrast.stride_texcoord[unitnum] = stride;
// DEFCOMMAND arguments: number 18, name SetShader, fields `int mode` and
// `int permutation`.
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a SetShader command: copies mode and permutation into the thread.
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1314 thread->shader_mode = command->mode;
1315 thread->shader_permutation = command->permutation;
// Allocates a SetShader command and also mirrors mode and permutation into
// the global dpsoftrast state.
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1319 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320 command->mode = mode;
1321 command->permutation = permutation;
// global copy of the same values
1323 dpsoftrast.shader_mode = mode;
1324 dpsoftrast.shader_permutation = permutation;
// DEFCOMMAND arguments: number 19, name Uniform4f, fields
// `DPSOFTRAST_UNIFORM index` and `float val[4]`.
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at float offset index*4.
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1330 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from scalars: allocates a Uniform4f
// command with the values and mirrors them into dpsoftrast.uniform4f at
// float offset index*4.
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1334 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335 command->index = index;
1336 command->val[0] = v0;
1337 command->val[1] = v1;
1338 command->val[2] = v2;
1339 command->val[3] = v3;
// global copy of the same values
1341 dpsoftrast.uniform4f[index*4+0] = v0;
1342 dpsoftrast.uniform4f[index*4+1] = v1;
1343 dpsoftrast.uniform4f[index*4+2] = v2;
1344 dpsoftrast.uniform4f[index*4+3] = v3;
// Sets a 4-component float uniform from an array: v must point to at least
// four floats, which are copied into a Uniform4f command and into
// dpsoftrast.uniform4f at float offset index*4.
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1348 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349 command->index = index;
1350 memcpy(command->val, v, sizeof(command->val));
// global copy of the same values
1352 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// DEFCOMMAND arguments: number 20, name UniformMatrix4f, fields
// `DPSOFTRAST_UNIFORM index` and an ALIGN()-qualified `float val[16]`.
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at float offset index*4.
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1358 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform `uniform`.
// Each matrix reads 16 floats from v and occupies four consecutive uniform
// indices (index advances by 4, v by 16 per matrix). One UniformMatrix4f
// command is allocated per matrix, and the same four rows are also written
// into dpsoftrast.uniform4f.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1364 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1366 __m128 m0, m1, m2, m3;
1367 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368 command->index = index;
// non-zero low address bits: source is not ALIGN_SIZE-aligned, so use
// unaligned loads
1369 if (((size_t)v)&(ALIGN_SIZE-1))
1371 m0 = _mm_loadu_ps(v);
1372 m1 = _mm_loadu_ps(v+4);
1373 m2 = _mm_loadu_ps(v+8);
1374 m3 = _mm_loadu_ps(v+12);
// aligned source: aligned loads
1378 m0 = _mm_load_ps(v);
1379 m1 = _mm_load_ps(v+4);
1380 m2 = _mm_load_ps(v+8);
1381 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 via unpack/move shuffles (presumably applied only
// when `transpose` is set - confirm)
1385 __m128 t0, t1, t2, t3;
1386 t0 = _mm_unpacklo_ps(m0, m1);
1387 t1 = _mm_unpacklo_ps(m2, m3);
1388 t2 = _mm_unpackhi_ps(m0, m1);
1389 t3 = _mm_unpackhi_ps(m2, m3);
1390 m0 = _mm_movelh_ps(t0, t1);
1391 m1 = _mm_movehl_ps(t1, t0);
1392 m2 = _mm_movelh_ps(t2, t3);
1393 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1395 _mm_store_ps(command->val, m0);
1396 _mm_store_ps(command->val+4, m1);
1397 _mm_store_ps(command->val+8, m2);
1398 _mm_store_ps(command->val+12, m3);
// aligned stores into the global copy
1399 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// DEFCOMMAND arguments: number 21, name Uniform1i, fields
// `DPSOFTRAST_UNIFORM index` and `int val`.
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the integer in the thread's uniform1i
// array at `index`.
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1410 thread->uniform1i[command->index] = command->val;
// Allocates a Uniform1i command for uniform `index` and mirrors the value i0
// into dpsoftrast.uniform1i.
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1414 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415 command->index = index;
// global copy of the value
1418 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float vectors from a byte-strided source into dst.
// dst is written with aligned stores; the source is read with unaligned
// loads when either the source address or the stride is not a multiple of
// ALIGN_SIZE, and with aligned loads otherwise.
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1424 float *end = dst + size*4;
1425 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1429 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1438 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float vectors from a byte-strided source into 4-float
// vectors in dst, with the fourth component set to 1.0.
// Tightly packed input (stride == sizeof(float[3])) is handled four vectors
// at a time: three 16-byte loads cover 12 floats, which are shuffled into
// four (x, y, z, 1) vectors, up to end4. The strided path converts one vector
// per load.
// The recurring idiom "rotate lanes so w sits in lane 0, _mm_move_ss a 1.0
// into lane 0, rotate back" is how lane 3 gets its 1.0.
// NOTE(review): the single-vector path loads 16 bytes per vector, i.e. one
// float beyond the three that are used - presumably the source needs readable
// bytes after its last element; confirm.
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1447 float *end = dst + size*4;
1448 if (stride == sizeof(float[3]))
// end of the part that can be processed four vectors at a time
1450 float *end4 = dst + (size&~3)*4;
1451 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1455 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// vector 0: x0 y0 z0 1
1456 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 1: x1 y1 z1 1
1459 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 2: x2 y2 z2 1
1462 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// vector 3: x3 y3 z3 1
1466 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1469 src += 4*sizeof(float[3]);
// aligned source: same shuffles with aligned loads
1476 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1490 src += 4*sizeof(float[3]);
// one vector per load: unaligned loads when the address or stride is not a
// multiple of ALIGN_SIZE
1494 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1498 __m128 v = _mm_loadu_ps((const float *)src);
1499 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502 _mm_store_ps(dst, v);
// aligned variant of the same conversion
1511 __m128 v = _mm_load_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
// Expands `size` 2-float vectors from a byte-strided source into 4-float
// vectors (x, y, 0, 1) in dst.
// Tightly packed input (stride == sizeof(float[2])) is converted two vectors
// per 16-byte load up to end2; otherwise one vector is read at a time with an
// 8-byte load.
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1524 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output vector
1525 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526 if (stride == sizeof(float[2]))
1528 float *end2 = dst + (size&~1)*4;
1529 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1; low half -> first output, high half -> second output
1533 __m128 v = _mm_loadu_ps((const float *)src);
1534 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1537 src += 2*sizeof(float[2]);
// aligned variant of the same conversion
1544 __m128 v = _mm_load_ps((const float *)src);
1545 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1548 src += 2*sizeof(float[2]);
// single vector: replace the low two lanes of v2 with x, y from src
1554 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte vectors from a byte-strided source into 4-float
// vectors in dst, scaling each byte by 1/255.
// Tightly packed input (stride == 4 bytes) is converted four vectors per
// 16-byte load up to end4; otherwise one 4-byte vector is read at a time.
// Bytes are widened to 32-bit integers by interleaving with zero (8 -> 16 ->
// 32 bits) before the int-to-float conversion.
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1562 float *end = dst + size*4;
1563 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564 if (stride == sizeof(unsigned char[4]))
1566 float *end4 = dst + (size&~3)*4;
1567 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v1 holds vectors 0-1 as 16-bit lanes, v2 holds 2-3
1571 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1577 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same conversion
1584 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// single vector: the four bytes are read through an int pointer
1596 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of the single 4-float vector at src; `end` marks
// dst + 4*size floats. src is read with an unaligned load, dst is written
// with aligned stores.
1603 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1605 float *end = dst + 4*size;
1606 __m128 v = _mm_loadu_ps(src);
1609 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors by the 4x4 matrix inmatrix16f:
//   result = v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3
// where m0..m3 are the four consecutive groups of four floats in the matrix.
// A matrix that is bytewise equal to the identity takes a fast path that only
// copies the data (and does nothing when out4f == in4f).
1615 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1618 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1619 __m128 m0, m1, m2, m3;
1621 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1623 // fast case for identity matrix
1624 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1627 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1628 m0 = _mm_loadu_ps(inmatrix16f);
1629 m1 = _mm_loadu_ps(inmatrix16f + 4);
1630 m2 = _mm_loadu_ps(inmatrix16f + 8);
1631 m3 = _mm_loadu_ps(inmatrix16f + 12);
1632 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: each component is broadcast and multiplied by its m-row
1636 __m128 v = _mm_loadu_ps(in4f);
1638 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1639 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1640 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1641 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same arithmetic with an aligned load
1650 __m128 v = _mm_load_ps(in4f);
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1654 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1655 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f.
// A non-positive count or an in-place call (out4f == in4f) is a no-op:
// memcpy must not be handed identical source and destination buffers, and a
// negative count would otherwise convert to a huge size_t.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-divides and viewport-maps one vertex. The input (x, y, z, w) is rotated to (1, x, y, z), multiplied by viewportscale, divided by w, offset by viewportcenter, then rotated back. Lane 0 of the viewport vectors therefore pairs with the constant 1 (it produces the output's last lane as center[0] + scale[0]/w), and lanes 1..3 pair with x, y, z.
1669 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same statements as DPSOFTRAST_PROJECTVERTEX: rotate to (1, x, y, z), scale, divide by w, add the viewport center, rotate back.
1677 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1679 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1680 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1681 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1682 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by a 4x4 matrix given as four rows: out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3 (p is presumably a local copy of `in`).
1685 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1688 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1691 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Derives a screen-row range for the axis-aligned box [minpos, maxpos] after
// its corners are transformed by m0..m3, and writes it to *starty / *endy.
// clipmask starts as 0xFF. BBFRONT transforms corner k, computes its clip
// distance (z + w of the transformed corner) and, when that is >= 0, clears
// bit k and folds the corner's lane-0 value divided by w into
// minproj/maxproj. For corners whose bit is still set, the BBCLIP body walks
// the three box edges leaving the corner (neighbours k^1, k^2, k^4); for each
// neighbour whose bit is clear it interpolates to the point where the clip
// distance reaches 0 and folds that point in as well.
1694 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1696 int clipmask = 0xFF;
// minproj/maxproj start inverted (2 / -2) so the first min/max replaces them
1697 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix row so the transformed y ends up in
// lane 0, where the scalar (_ss) operations below work
1698 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1699 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1700 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1701 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1702 #define BBFRONT(k, pos) \
1704 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1705 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1706 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1709 clipmask &= ~(1<<k); \
1710 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711 minproj = _mm_min_ss(minproj, proj); \
1712 maxproj = _mm_max_ss(maxproj, proj); \
1716 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1717 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1718 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1719 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1720 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1721 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1725 if (clipmask&(1<<k)) \
1727 if (!(clipmask&(1<<(k^1)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^2)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1743 if (!(clipmask&(1<<(k^4)))) \
1745 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1746 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1747 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748 minproj = _mm_min_ss(minproj, proj); \
1749 maxproj = _mm_max_ss(maxproj, proj); \
1753 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
// (lane 2 is the y term in the layout DPSOFTRAST_PROJECTVERTEX uses)
1754 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1755 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to the screen
1756 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1757 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1758 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1759 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// truncating conversions; NOTE(review): the start row comes from maxproj and
// the end row from minproj, presumably because the y scale is negative -
// confirm
1760 *starty = _mm_cvttss_si32(maxproj);
1761 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed vertices to screen space.
// Each input vector is stored unchanged to out4f and its projected form
// (DPSOFTRAST_PROJECTVERTEX with the framebuffer viewport) to screen4f, while
// the component-wise min/max of the inputs is accumulated. The function
// returns the result of DPSOFTRAST_Vertex_BoundY for that bounding box with
// identity matrix rows, which also fills *starty / *endy.
1766 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1769 float *end = out4f + numitems*4;
1770 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1771 __m128 minpos, maxpos;
1772 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1774 minpos = maxpos = _mm_loadu_ps(in4f);
1777 __m128 v = _mm_loadu_ps(in4f);
1778 minpos = _mm_min_ps(minpos, v);
1779 maxpos = _mm_max_ps(maxpos, v);
1780 _mm_store_ps(out4f, v);
1781 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1782 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1790 minpos = maxpos = _mm_load_ps(in4f);
1793 __m128 v = _mm_load_ps(in4f);
1794 minpos = _mm_min_ps(minpos, v);
1795 maxpos = _mm_max_ps(maxpos, v);
1796 _mm_store_ps(out4f, v);
1797 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1798 _mm_store_ps(screen4f, v);
// no extra transform is needed for the bounds: pass identity rows
1805 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1806 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1807 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1808 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1809 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems vertices by inmatrix16f and projects them to screen
// space. A matrix bytewise equal to the identity delegates to
// DPSOFTRAST_Vertex_Project. Otherwise each vertex is transformed and stored
// to out4f, then projected and stored to screen4f. The bounding box is
// accumulated from the untransformed inputs and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix rows; that call's result
// is returned.
1814 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1817 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1818 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1820 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1821 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1822 end = out4f + numitems*4;
1823 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1824 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1825 m0 = _mm_loadu_ps(inmatrix16f);
1826 m1 = _mm_loadu_ps(inmatrix16f + 4);
1827 m2 = _mm_loadu_ps(inmatrix16f + 8);
1828 m3 = _mm_loadu_ps(inmatrix16f + 12);
1829 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1831 minpos = maxpos = _mm_loadu_ps(in4f);
1834 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform is applied
1835 minpos = _mm_min_ps(minpos, v);
1836 maxpos = _mm_max_ps(maxpos, v);
1837 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1838 _mm_store_ps(out4f, v);
1839 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1840 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1848 minpos = maxpos = _mm_load_ps(in4f);
1851 __m128 v = _mm_load_ps(in4f);
1852 minpos = _mm_min_ps(minpos, v);
1853 maxpos = _mm_max_ps(maxpos, v);
1854 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1855 _mm_store_ps(out4f, v);
1856 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1857 _mm_store_ps(screen4f, v);
1864 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts one client vertex array into 4-float vectors stored in
// dpsoftrast.post_array4f[outarray], covering numvertices vertices starting
// at firstvertex (the byte offset is firstvertex * stride).
// Positions use the 3-float loader. Colors come from the float pointer if
// set, else from the unsigned-byte pointer if set, else the constant
// dpsoftrast.color is replicated. Texture coordinates are indexed by
// inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and use the 2-, 3- or 4-float loader
// (presumably selected by the stored component count).
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1871 float *outf = dpsoftrast.post_array4f[outarray];
1872 const unsigned char *inb;
1873 int firstvertex = dpsoftrast.firstvertex;
1874 int numvertices = dpsoftrast.numvertices;
1878 case DPSOFTRAST_ARRAY_POSITION:
1879 stride = dpsoftrast.stride_vertex;
1880 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1883 case DPSOFTRAST_ARRAY_COLOR:
1884 stride = dpsoftrast.stride_color;
1885 if (dpsoftrast.pointer_color4f)
1887 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1890 else if (dpsoftrast.pointer_color4ub)
// stride already holds dpsoftrast.stride_color from the top of this case
1892 stride = dpsoftrast.stride_color;
1893 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array: replicate the constant color
1898 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinate arrays
1902 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1905 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1906 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1909 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1912 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1915 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an array in place by inmatrix16f. With inarray >= 0 the client
// array is first loaded into post_array4f[outarray]; otherwise the data
// already in post_array4f[outarray] is used.
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1926 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an array to screen space. With inarray >= 0 the client array is
// first loaded into post_array4f[outarray]; otherwise the existing data is
// used. Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty / drawendy, and the projection's return value to
// dpsoftrast.drawclipped.
1932 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1934 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1935 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an array by inmatrix16f and projects it to screen space. With
// inarray >= 0 the client array is first loaded into post_array4f[outarray];
// otherwise the existing data is used. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty /
// drawendy, and the call's return value to dpsoftrast.drawclipped.
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1942 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel reciprocal of the interpolated w term for a span
// (presumably written to zf - confirm).
// w is evaluated from the triangle's plane coefficients (w[0] per x, w[1]
// per y, w[2] constant) at the span origin. The exact reciprocal is taken
// only at sub-span boundaries, at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
// apart, and stepped linearly (dz) in between.
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1950 int startx = span->startx;
1951 int endx = span->endx;
1952 float wslope = triangle->w[0];
1953 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact value at the first pixel
1954 float endz = 1.0f / (w + wslope * startx);
1955 for (x = startx;x < endx;)
1957 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop at the final pixel of the span
1959 if (nextsub >= endx) nextsub = endsub = endx-1;
1960 endz = 1.0f / (w + wslope * nextsub);
// linear step across this sub-span (0 when it is a single pixel)
1961 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962 for (; x <= endsub; x++, z += dz)
// Writes a span of float colors (in4f, four floats per pixel) to the color
// framebuffer, applying the thread's blend mode.
// `pixel` points at the span origin in dpsoftrast.fb_colorpixels[0], 4 bytes
// per pixel. Input components 0 and 2 are swapped on the way to the
// framebuffer bytes (in4f[+2] -> byte 0, in4f[+0] -> byte 2). When alpha
// testing is enabled, pixels whose alpha is below 0.5 are cleared in the
// span's pixelmask first. In the formulas below, src is the input color, dst
// the existing framebuffer byte and alpha is in4f[x*4+3]; results are clamped
// to 255, and only SUBALPHA also clamps at 0.
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1970 int startx = span->startx;
1971 int endx = span->endx;
1974 unsigned char * RESTRICT pixelmask = span->pixelmask;
1975 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1978 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979 // handle alphatest now (this affects depth writes too)
1980 if (thread->alphatest)
1981 for (x = startx;x < endx;x++)
1982 if (in4f[x*4+3] < 0.5f)
1983 pixelmask[x] = false;
1984 // FIXME: this does not handle bigendian
1985 switch(thread->fb_blendmode)
1987 case DPSOFTRAST_BLENDMODE_OPAQUE:
// dst = src * 255
1988 for (x = startx;x < endx;x++)
1992 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
2002 case DPSOFTRAST_BLENDMODE_ALPHA:
// dst = src * alpha * 255 + dst * (1 - alpha)
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
2019 case DPSOFTRAST_BLENDMODE_ADDALPHA:
// dst = src * alpha * 255 + dst
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
2035 case DPSOFTRAST_BLENDMODE_ADD:
// dst = src * 255 + dst
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
2050 case DPSOFTRAST_BLENDMODE_INVMOD:
// dst = (1 - src) * dst
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
2065 case DPSOFTRAST_BLENDMODE_MUL:
// dst = src * dst
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
2080 case DPSOFTRAST_BLENDMODE_MUL2:
// dst = src * dst * 2
2081 for (x = startx;x < endx;x++)
2085 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089 pixel[x*4+0] = d[0];
2090 pixel[x*4+1] = d[1];
2091 pixel[x*4+2] = d[2];
2092 pixel[x*4+3] = d[3];
2095 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// dst = dst - src * alpha * 255, clamped to [0, 255]
2096 for (x = startx;x < endx;x++)
2100 a = in4f[x*4+3] * -255.0f;
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
2111 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
// dst = src * a + dst * (1 - alpha), with a set per pixel before use
2112 for (x = startx;x < endx;x++)
2117 b = 1.0f - in4f[x*4+3];
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
// Writes a finished span of 8-bit colors (in4ub, one 32-bit pixel per x) into
// color buffer 0 of the global dpsoftrast framebuffer, combining them with
// the pixels already stored there according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the framebuffer blend mode
//   triangle - not referenced by any line visible in this listing
//   span     - supplies startx/endx, the per-pixel pixelmask and the x/y
//              position used to locate the destination pixels
//   in4ub    - source colors, 4 bytes per pixel, indexed by x
// NOTE(review): this listing does not show every line of the function
// (braces and other short statements are missing), so the control flow
// between the visible lines should be confirmed against the full file.
2131 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2135 int startx = span->startx;
2136 int endx = span->endx;
2137 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2138 unsigned char * RESTRICT pixelmask = span->pixelmask;
2139 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2140 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views of the framebuffer (bytes and 32-bit pixels) to the
// position given by span->x / span->y
2143 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2144 pixeli += span->y * dpsoftrast.fb_width + span->x;
2145 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds bytes, so "< 0.5f" is only true for alpha == 0;
// confirm whether a mid-range threshold was intended for the 8-bit path
2146 if (thread->alphatest)
2147 for (x = startx;x < endx;x++)
2148 if (in4ub[x*4+3] < 0.5f)
2149 pixelmask[x] = false;
2150 // FIXME: this does not handle bigendian
2151 switch(thread->fb_blendmode)
// opaque: when four consecutive pixelmask bytes read as 0x01010101, copy the
// four source pixels with one unaligned 128-bit load/store
2153 case DPSOFTRAST_BLENDMODE_OPAQUE:
2154 for (x = startx;x + 4 <= endx;)
2156 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2158 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1) provides the loops shared by all blending
// modes. The first loop takes pixels two at a time and dispatches on the two
// pixelmask bytes read as one 16-bit value: either both pixels are processed
// through a 64-bit load/store, or only pixel x+1, or only pixel x. Source and
// destination bytes are widened to 16-bit lanes and the result is packed back
// to bytes with unsigned saturation. The second loop handles the remaining
// pixels one at a time, testing pixelmask[x] for each.
// The lines that expand blend2/blend1 are not visible here; judging by the
// arguments passed below, blend2 updates dst for a pair of pixels and blend1
// for a single pixel - TODO confirm.
// alpha: dst += (src - dst) * srcalpha / 256, computed with _mm_mulhi_epi16
// after shifting both factors left by 4; the source alpha is broadcast from
// 16-bit lane 3 of each pixel
2172 case DPSOFTRAST_BLENDMODE_ALPHA:
2173 #define FINISHBLEND(blend2, blend1) \
2174 for (x = startx;x + 2 <= endx;x += 2) \
2177 switch (*(const unsigned short*)&pixelmask[x]) \
2180 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2181 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2183 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2186 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2187 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2189 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2192 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2193 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2195 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2200 for(;x < endx; x++) \
2203 if (!pixelmask[x]) \
2205 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2208 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2212 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2213 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2215 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2216 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// addalpha: dst += (src * srcalpha) >> 8
2219 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2221 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2222 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2224 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2225 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// add: dst = src + dst (the clamp to 255 comes from the saturating pack in
// FINISHBLEND)
2228 case DPSOFTRAST_BLENDMODE_ADD:
2229 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// invmod: dst -= (dst * src) >> 8
2231 case DPSOFTRAST_BLENDMODE_INVMOD:
2233 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2235 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// mul: dst = (src * dst) >> 8
2238 case DPSOFTRAST_BLENDMODE_MUL:
2239 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// mul2: dst = (src * dst) >> 7, i.e. the mul result doubled
2241 case DPSOFTRAST_BLENDMODE_MUL2:
2242 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subalpha: dst -= (src * srcalpha) >> 8
2244 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2246 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudoalpha: dst = src + dst - ((dst * srcalpha) >> 8)
2253 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2255 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2256 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2258 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2259 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// Samples the 2D texture bound to texunitindex along a span and writes four
// floats per pixel to out4f. Texel bytes are scaled to 0..1 and stored in the
// order byte 2, byte 1, byte 0, byte 3.
//   thread       - provides texbound[texunitindex]
//   triangle     - provides the mip level (triangle->mip[texunitindex]) and is
//                  handed to DPSOFTRAST_CALCATTRIB4F
//   span         - provides startx/endx and is handed to DPSOFTRAST_CALCATTRIB4F
//   out4f        - output colors, indexed by x*4
//   arrayindex   - handed to DPSOFTRAST_CALCATTRIB4F to fill data/slope
//   zf           - per-pixel factor the interpolated texcoords are multiplied by
// NOTE(review): this listing does not show every line of the function
// (braces, short declarations and short assignments are missing), so the
// branch structure around the visible lines should be confirmed against the
// full file.
2266 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2269 int startx = span->startx;
2270 int endx = span->endx;
2275 float tc[2], endtc[2];
2277 unsigned int tci[2];
2278 unsigned int tci1[2];
2279 unsigned int tcimin[2];
2280 unsigned int tcimax[2];
2285 const unsigned char * RESTRICT pixelbase;
2286 const unsigned char * RESTRICT pixel[4];
2287 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2288 // if no texture is bound, just fill it with white
2291 for (x = startx;x < endx;x++)
2293 out4f[x*4+0] = 1.0f;
2294 out4f[x*4+1] = 1.0f;
2295 out4f[x*4+2] = 1.0f;
2296 out4f[x*4+3] = 1.0f;
// pixelbase points at the texel data of the mip level chosen for the triangle
// (mipmap[mip][0] is used as a byte offset into texture->bytes)
2300 mip = triangle->mip[texunitindex];
2301 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2302 // if this mipmap of the texture is 1 pixel, just fill it with that color
2303 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the constant color is read from texture->bytes, not from
// pixelbase, whereas the BGRA8 variant of this function reads pixelbase -
// confirm which is intended when mip is not the first level
2305 c[0] = texture->bytes[2] * (1.0f/255.0f);
2306 c[1] = texture->bytes[1] * (1.0f/255.0f);
2307 c[2] = texture->bytes[0] * (1.0f/255.0f);
2308 c[3] = texture->bytes[3] * (1.0f/255.0f);
2309 for (x = startx;x < endx;x++)
2311 out4f[x*4+0] = c[0];
2312 out4f[x*4+1] = c[1];
2313 out4f[x*4+2] = c[2];
2314 out4f[x*4+3] = c[3];
// filter keeps only the LINEAR bit of the texture's filter setting
2318 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2319 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2320 flags = texture->flags;
// mipmap[mip][2] and [3] act as the horizontal and vertical texel counts:
// they scale the texcoords, [2] is also the row pitch in texels, and the
// count minus one serves both as clamp maximum and as wrap mask
// NOTE(review): wrapping by masking with count-1 assumes power-of-two sizes
2321 tcscale[0] = texture->mipmap[mip][2];
2322 tcscale[1] = texture->mipmap[mip][3];
2323 tciwidth = texture->mipmap[mip][2];
2326 tcimax[0] = texture->mipmap[mip][2]-1;
2327 tcimax[1] = texture->mipmap[mip][3]-1;
2328 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2329 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texcoords at the first pixel: interpolated attribute times zf, scaled to
// texels and shifted by half a texel
2330 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2331 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is processed in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: texcoords are evaluated with zf only at subspan ends and stepped
// linearly in 16.16 fixed point in between
2332 for (x = startx;x < endx;)
2334 unsigned int subtc[2];
2335 unsigned int substep[2];
2336 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2337 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last subspan: end on the final pixel and rescale the step for its length
2338 if (nextsub >= endx)
2340 nextsub = endsub = endx-1;
2341 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2345 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2346 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): the assignment of tc is not visible here (presumably the
// previous endtc); since endtc carries a -0.5f bias the values can be
// negative while subtc/substep are unsigned - confirm the float-to-unsigned
// conversions behave as intended on the supported compilers
2347 substep[0] = (endtc[0] - tc[0]) * subscale;
2348 substep[1] = (endtc[1] - tc[1]) * subscale;
2349 subtc[0] = tc[0] * (1<<16);
2350 subtc[1] = tc[1] * (1<<16);
// four-texel blend, clamped: lerp[] holds the four weights built from the
// fraction bits (they sum to 0x1000000, so weights times a byte value of 255
// reach 0xFF000000, which the final multiply normalizes to 1.0); the integer
// coords and their +1 neighbors are clamped to [tcimin, tcimax]
// NOTE(review): frac is taken from the low 12 bits of the 16.16 value while
// the integer part is subtc>>16 - confirm bits 4..15 were not intended
2353 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2355 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2357 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2358 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2359 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2360 tci[0] = subtc[0]>>16;
2361 tci[1] = subtc[1]>>16;
2362 tci1[0] = tci[0] + 1;
2363 tci1[1] = tci[1] + 1;
2364 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2365 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2366 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2367 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2368 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2369 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2370 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2371 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2372 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2373 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2374 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2375 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2376 out4f[x*4+0] = c[0];
2377 out4f[x*4+1] = c[1];
2378 out4f[x*4+2] = c[2];
2379 out4f[x*4+3] = c[3];
// four-texel blend, wrapped: same weights, coords masked with tciwrapmask
2384 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2386 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2387 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2388 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2389 tci[0] = subtc[0]>>16;
2390 tci[1] = subtc[1]>>16;
2391 tci1[0] = tci[0] + 1;
2392 tci1[1] = tci[1] + 1;
2393 tci[0] &= tciwrapmask[0];
2394 tci[1] &= tciwrapmask[1];
2395 tci1[0] &= tciwrapmask[0];
2396 tci1[1] &= tciwrapmask[1];
2397 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2398 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2399 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2400 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2401 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2402 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2403 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2404 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2405 out4f[x*4+0] = c[0];
2406 out4f[x*4+1] = c[1];
2407 out4f[x*4+2] = c[2];
2408 out4f[x*4+3] = c[3];
// single-texel lookup, coords clamped to [tcimin, tcimax]
2412 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2414 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2416 tci[0] = subtc[0]>>16;
2417 tci[1] = subtc[1]>>16;
2418 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2419 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2420 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421 c[0] = pixel[0][2] * (1.0f / 255.0f);
2422 c[1] = pixel[0][1] * (1.0f / 255.0f);
2423 c[2] = pixel[0][0] * (1.0f / 255.0f);
2424 c[3] = pixel[0][3] * (1.0f / 255.0f);
2425 out4f[x*4+0] = c[0];
2426 out4f[x*4+1] = c[1];
2427 out4f[x*4+2] = c[2];
2428 out4f[x*4+3] = c[3];
// single-texel lookup, coords masked with tciwrapmask
2433 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2435 tci[0] = subtc[0]>>16;
2436 tci[1] = subtc[1]>>16;
2437 tci[0] &= tciwrapmask[0];
2438 tci[1] &= tciwrapmask[1];
2439 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2440 c[0] = pixel[0][2] * (1.0f / 255.0f);
2441 c[1] = pixel[0][1] * (1.0f / 255.0f);
2442 c[2] = pixel[0][0] * (1.0f / 255.0f);
2443 c[3] = pixel[0][3] * (1.0f / 255.0f);
2444 out4f[x*4+0] = c[0];
2445 out4f[x*4+1] = c[1];
2446 out4f[x*4+2] = c[2];
2447 out4f[x*4+3] = c[3];
// SSE2 sampler for the 2D texture bound to texunitindex: produces one 32-bit
// pixel per x in out4ub, copying or blending texel bytes without reordering
// the channels.
//   thread       - provides texbound[texunitindex]
//   triangle     - provides the mip level (triangle->mip[texunitindex]) and is
//                  handed to DPSOFTRAST_CALCATTRIB
//   span         - provides startx/endx/length and is handed to
//                  DPSOFTRAST_CALCATTRIB
//   out4ub       - output pixels, 4 bytes each, indexed by x
//   arrayindex   - handed to DPSOFTRAST_CALCATTRIB to fill data/slope
//   zf           - per-pixel factor the interpolated texcoords are multiplied by
// NOTE(review): this listing does not show every line of the function
// (braces, else lines, short declarations and the tests that select between
// the paths are partly missing), so the branch structure around the visible
// lines should be confirmed against the full file.
2453 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2457 int startx = span->startx;
2458 int endx = span->endx;
2460 __m128 data, slope, tcscale;
2461 __m128i tcsize, tcmask, tcoffset, tcmax;
2463 __m128i subtc, substep, endsubtc;
2466 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2467 const unsigned char * RESTRICT pixelbase;
2468 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2469 // if no texture is bound, just fill it with white
2472 memset(out4ub + startx*4, 255, span->length*4);
// pixelbase points at the texel data of the mip level chosen for the triangle
2475 mip = triangle->mip[texunitindex];
2476 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2477 // if this mipmap of the texture is 1 pixel, just fill it with that color
2478 if (texture->mipmap[mip][1] == 4)
// k is that texel read as one 32-bit value; the body of the loop below is not
// visible here (presumably it stores k to outi[x])
2480 unsigned int k = *((const unsigned int *)pixelbase);
2481 for (x = startx;x < endx;x++)
// filter keeps only the LINEAR bit of the texture's filter setting
2485 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2486 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2487 flags = texture->flags;
// tcsize = (m[2], m[3], m[2], m[3]) with m = mipmap[mip], used as the texel
// counts per axis; tcmask is that minus one and tcscale its float form
2488 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2489 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2490 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves of the
// register and pre-scale them to texel units
2491 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2492 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texcoords at the first pixel (times zf, minus half a texel), also kept as
// 16.16 fixed point in endsubtc
2493 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2494 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// tcoffset: every 32-bit lane holds the 16-bit pair (4, m[2]*4), so
// _mm_madd_epi16 of an integer (x, y) pair with it gives the byte offset
// x*4 + y*m[2]*4 of that texel
2495 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: tcmask packed to 16-bit lanes, used to clamp or mask integer coords
2496 tcmax = _mm_packs_epi32(tcmask, tcmask);
// the span is processed in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: texcoords are evaluated with zf only at subspan ends and stepped
// linearly in 16.16 fixed point in between
2497 for (x = startx;x < endx;)
2499 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2500 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last subspan: end on the final pixel and rescale the step for its length
2501 if (nextsub >= endx)
2503 nextsub = endsub = endx-1;
2504 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2508 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2509 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2510 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc now carries two consecutive pixels (low half: pixel x, high half:
// pixel x+1), so the step is doubled to advance two pixels at a time
2511 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2512 substep = _mm_slli_epi32(substep, 1);
// tcrange = integer coords at the subspan start and at endsubtc + substep;
// if all of them are >= 0 and < tcmask the loads below need no clamping or
// wrapping and fetch horizontally adjacent texels with single 64-bit loads
2515 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2516 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = bytes per texel row (the high 16 bits of a tcoffset lane)
2518 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
// two pixels per iteration: ptr1/ptr2 address the first texel of each pixel's
// 2x2 footprint. fracm holds the fraction words halved so they stay positive
// for the signed _mm_mulhi_epi16. Each footprint is first blended between its
// two rows with the y fraction, then the left/right results of both pixels
// are gathered and blended with the x fractions.
2519 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2521 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2522 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2523 tci = _mm_madd_epi16(tci, tcoffset);
2524 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2525 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2526 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2527 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2528 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2529 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2530 fracm = _mm_srli_epi16(subtc, 1);
2531 pix1 = _mm_add_epi16(pix1,
2532 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2533 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2534 pix3 = _mm_add_epi16(pix3,
2535 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2536 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2537 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2538 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2539 pix2 = _mm_add_epi16(pix2,
2540 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2541 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2542 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the blend above (the test guarding it is not visible;
// presumably it handles the odd pixel left at the end of the subspan)
2546 const unsigned char * RESTRICT ptr1;
2547 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2548 tci = _mm_madd_epi16(tci, tcoffset);
2549 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2550 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2551 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2552 fracm = _mm_srli_epi16(subtc, 1);
2553 pix1 = _mm_add_epi16(pix1,
2554 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2557 pix1 = _mm_add_epi16(pix1,
2558 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2559 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2560 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// clamp-to-edge blend: the integer (x, y) pair is replicated to all four
// lanes and (0,0), (1,0), (0,1), (1,1) are added to form the four neighbor
// coords, which are clamped to [0, tcmax] before the byte offsets are
// computed; each texel is then fetched with its own 32-bit load
2564 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2566 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2568 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2569 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2570 tci = _mm_madd_epi16(tci, tcoffset);
2571 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2572 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2573 _mm_setzero_si128());
2574 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2575 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2576 _mm_setzero_si128());
2577 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is masked with _mm_and_si128
// (wrapping) although this is the clamp-to-edge branch and the first pixel
// above uses min/max - confirm whether a clamp was intended here as well
2578 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2579 tci = _mm_madd_epi16(tci, tcoffset);
2580 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2581 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2582 _mm_setzero_si128());
2583 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2584 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2585 _mm_setzero_si128());
2586 fracm = _mm_srli_epi16(subtc, 1);
2587 pix1 = _mm_add_epi16(pix1,
2588 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2589 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2590 pix3 = _mm_add_epi16(pix3,
2591 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2592 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2593 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2594 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2595 pix2 = _mm_add_epi16(pix2,
2596 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2597 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2598 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the clamped blend (guarding test not visible)
2602 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2603 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2604 tci = _mm_madd_epi16(tci, tcoffset);
2605 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2606 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2607 _mm_setzero_si128());
2608 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2609 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2610 _mm_setzero_si128());
2611 fracm = _mm_srli_epi16(subtc, 1);
2612 pix1 = _mm_add_epi16(pix1,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2615 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2616 pix1 = _mm_add_epi16(pix1,
2617 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2619 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// wrapping blend: same scheme, with all neighbor coords masked by tcmax
2625 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2627 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2628 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2629 tci = _mm_madd_epi16(tci, tcoffset);
2630 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632 _mm_setzero_si128());
2633 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635 _mm_setzero_si128());
2636 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2637 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2638 tci = _mm_madd_epi16(tci, tcoffset);
2639 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2640 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2641 _mm_setzero_si128());
2642 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2643 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2644 _mm_setzero_si128());
2645 fracm = _mm_srli_epi16(subtc, 1);
2646 pix1 = _mm_add_epi16(pix1,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2649 pix3 = _mm_add_epi16(pix3,
2650 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2651 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2652 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2653 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2654 pix2 = _mm_add_epi16(pix2,
2655 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2656 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2657 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel version of the wrapping blend (guarding test not visible)
2661 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2662 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2663 tci = _mm_madd_epi16(tci, tcoffset);
2664 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2665 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2666 _mm_setzero_si128());
2667 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2668 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2669 _mm_setzero_si128());
2670 fracm = _mm_srli_epi16(subtc, 1);
2671 pix1 = _mm_add_epi16(pix1,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2674 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2675 pix1 = _mm_add_epi16(pix1,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2678 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel lookups: the integer coords are clamped to [0, tcmax] here (or
// masked with tcmax in the loops further down), turned into a byte offset and
// the 32-bit texel is copied unchanged
2685 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2687 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2689 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2690 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2691 tci = _mm_madd_epi16(tci, tcoffset);
2692 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2693 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2697 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2698 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2699 tci = _mm_madd_epi16(tci, tcoffset);
2700 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel lookup with wrapping
2706 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2708 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2709 tci = _mm_and_si128(tci, tcmax);
2710 tci = _mm_madd_epi16(tci, tcoffset);
2711 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2712 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2716 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2717 tci = _mm_and_si128(tci, tcmax);
2718 tci = _mm_madd_epi16(tci, tcoffset);
2719 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-texture entry point with the same output convention as the 2D BGRA8
// sampler. The only statement visible in this listing sets every byte of
// span->length pixels in out4ub to 255; triangle, texunitindex, arrayindex
// and zf are not referenced by it.
// NOTE(review): the fill starts at out4ub itself, whereas the no-texture fill
// of the 2D BGRA8 sampler starts at out4ub + startx*4 - confirm which offset
// the callers expect.
2728 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2731 memset(out4ub, 255, span->length*4);
// Takes a pointer to read-only float data and returns a float.
// NOTE(review): the body is not part of this listing; going by the name it
// presumably looks up a shadow map for the given vector - confirm against
// the full file before relying on the result.
2734 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel x in [startx, endx) evaluates a four-component varying as
// (data + slope*x) * z and multiplies in4f by it component-wise, storing the
// product in out4f. data and slope are filled by DPSOFTRAST_CALCATTRIB4F from
// triangle, span and arrayindex.
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing; presumably z is taken from zf[x] - confirm.
2740 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2743 int startx = span->startx;
2744 int endx = span->endx;
2749 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2750 for (x = startx;x < endx;x++)
// c = varying value at this pixel
2753 c[0] = (data[0] + slope[0]*x) * z;
2754 c[1] = (data[1] + slope[1]*x) * z;
2755 c[2] = (data[2] + slope[2]*x) * z;
2756 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input color by it
2757 out4f[x*4+0] = in4f[x*4+0] * c[0];
2758 out4f[x*4+1] = in4f[x*4+1] * c[1];
2759 out4f[x*4+2] = in4f[x*4+2] * c[2];
2760 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel x in [startx, endx) evaluates a four-component varying as
// (data + slope*x) * z and stores it in out4f. data and slope are filled by
// DPSOFTRAST_CALCATTRIB4F from triangle, span and arrayindex.
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing; presumably z is taken from zf[x] - confirm.
2764 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2767 int startx = span->startx;
2768 int endx = span->endx;
2773 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2774 for (x = startx;x < endx;x++)
// c = varying value at this pixel
2777 c[0] = (data[0] + slope[0]*x) * z;
2778 c[1] = (data[1] + slope[1]*x) * z;
2779 c[2] = (data[2] + slope[2]*x) * z;
2780 c[3] = (data[3] + slope[3]*x) * z;
2781 out4f[x*4+0] = c[0];
2782 out4f[x*4+1] = c[1];
2783 out4f[x*4+2] = c[2];
2784 out4f[x*4+3] = c[3];
// For every pixel x in [startx, endx) and each of the four components:
// out4f = ina4f + max(inb4f - subcolor, 0).
//   span     - supplies startx/endx
//   out4f    - destination buffer, indexed by x*4
//   ina4f    - buffer that is added unchanged
//   inb4f    - buffer that has subcolor subtracted (floored at zero) first
//   subcolor - four floats, copied to a local array before the loop
// triangle is not referenced.
2788 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2790 int x, startx = span->startx, endx = span->endx;
2791 float c[4], localcolor[4];
2792 localcolor[0] = subcolor[0];
2793 localcolor[1] = subcolor[1];
2794 localcolor[2] = subcolor[2];
2795 localcolor[3] = subcolor[3];
2796 for (x = startx;x < endx;x++)
// subtract the threshold color and keep only what lies above it
2798 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2799 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2800 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2801 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2802 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2803 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2804 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2805 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// For every pixel x in [startx, endx) stores the component-wise product of
// ina4f and inb4f (four floats per pixel) in out4f.
// triangle is not referenced; span only supplies startx/endx.
2809 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2811 int x, startx = span->startx, endx = span->endx;
2812 for (x = startx;x < endx;x++)
2814 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2815 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2816 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2817 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// For every pixel x in [startx, endx) stores the component-wise sum of ina4f
// and inb4f (four floats per pixel) in out4f. No clamping is applied here.
// triangle is not referenced; span only supplies startx/endx.
2821 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2823 int x, startx = span->startx, endx = span->endx;
2824 for (x = startx;x < endx;x++)
2826 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2827 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2828 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2829 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// For every pixel x in [startx, endx) stores ina4f * a + inb4f * b in out4f
// for all four components, where a is one minus the fourth component of inb4f
// at that pixel.
// triangle is not referenced; span only supplies startx/endx.
// NOTE(review): the declaration and assignment of b are not visible in this
// listing; presumably b = inb4f[x*4+3], which would make this a blend of
// inb4f over ina4f by inb4f's own alpha - confirm.
2833 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2835 int x, startx = span->startx, endx = span->endx;
2837 for (x = startx;x < endx;x++)
2839 a = 1.0f - inb4f[x*4+3];
2841 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2842 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2843 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2844 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a single color into a buffer: for every pixel x in [startx, endx)
// and each of the four components,
// out4f = in4f * (1 - color[3]) + color * color[3].
// The fourth component of color is thus both the blend weight and one of the
// blended values.
//   span  - supplies startx/endx
//   out4f - destination buffer, indexed by x*4
//   in4f  - source buffer, indexed by x*4
//   color - four floats, copied to a local array before the loop
// triangle is not referenced.
2848 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2850 int x, startx = span->startx, endx = span->endx;
2851 float localcolor[4], ilerp, lerp;
2852 localcolor[0] = color[0];
2853 localcolor[1] = color[1];
2854 localcolor[2] = color[2];
2855 localcolor[3] = color[3];
// the weights are the same for every pixel, so compute them once
2856 ilerp = 1.0f - localcolor[3];
2857 lerp = localcolor[3];
2858 for (x = startx;x < endx;x++)
2860 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2861 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2862 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2863 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies the 8-bit pixels of in4ub by vertex attribute `arrayindex` and writes them to out4ub (SSE2).
// The attribute is evaluated as (data + slope*x) * zf[x] at sub-span boundaries, converted to
// fixed point (scale 256) and advanced by an integer step for the pixels in between.
// NOTE(review): the declarations of x, data, slope, mod and endmod, the per-sub-span assignments of
// mod/submod and the condition guarding the single-pixel tail are not visible in this excerpt.
2869 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2873 int startx = span->startx;
2874 int endx = span->endx;
2877 __m128i submod, substep, endsubmod;
// presumably fills data/slope with the attribute's base value and per-pixel slope - confirm against the macro
2878 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 keep their place)
2879 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2880 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as fixed point
2881 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2882 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2883 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels whose last pixel is endsub
2885 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2886 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2887 if (nextsub >= endx)
// final sub-span: end on the last pixel and rescale the step to the shortened length
2889 nextsub = endsub = endx-1;
2890 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at nextsub and the fixed-point step from mod towards it
2894 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2895 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2896 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// 16-bit lanes: low half = factor for pixel x, high half = factor for pixel x+1
2897 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2898 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration but submod by a single substep - confirm this is intended
2899 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// interleaving zero below each byte gives pixel*256, so mulhi yields (pixel * factor) >> 8
2901 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2902 pix = _mm_mulhi_epu16(pix, submod);
// pack to bytes with unsigned saturation and store two pixels
2903 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single pixel: same math on one 32-bit load/store
2907 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2908 pix = _mm_mulhi_epu16(pix, submod);
2909 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes vertex attribute `arrayindex` itself as 8-bit pixels into out4ub (SSE2).
// The attribute is evaluated as (data + slope*x) * zf[x] at sub-span boundaries in fixed point
// (scale 4095) and stepped for the pixels in between; >>4 then brings it to byte range.
// NOTE(review): the declarations of x, data, slope, mod and endmod, the per-sub-span assignments of
// mod/submod and the condition guarding the single-pixel tail are not visible in this excerpt.
2916 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2920 int startx = span->startx;
2921 int endx = span->endx;
2924 __m128i submod, substep, endsubmod;
// presumably fills data/slope with the attribute's base value and per-pixel slope - confirm against the macro
2925 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap lanes 0 and 2 (lanes 1 and 3 keep their place)
2926 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2927 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, as float and as fixed point
2928 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2929 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2930 for (x = startx; x < endx;)
// default sub-span: DPSOFTRAST_DRAW_MAXSUBSPAN pixels whose last pixel is endsub
2932 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2933 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2934 if (nextsub >= endx)
// final sub-span: end on the last pixel and rescale the step to the shortened length
2936 nextsub = endsub = endx-1;
2937 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at nextsub and the fixed-point step from mod towards it
2941 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2942 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2943 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2944 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2945 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration but submod by a single substep - confirm this is intended
2946 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift by 4, then pack to bytes with unsigned saturation and store two pixels
2948 __m128i pix = _mm_srai_epi16(submod, 4);
2949 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single pixel: same conversion, 32-bit store
2953 __m128i pix = _mm_srai_epi16(submod, 4);
2954 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// out4ub = ina4ub + (inb4ub - subcolor*255) per byte channel, saturated to 0..255 when packed (SSE2).
// Processes two pixels per iteration, then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
2961 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2964 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 integers with lanes 0 and 2 swapped
2965 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes and repeat the four channels in both 64-bit halves (one copy per pixel)
2966 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2967 for (x = startx;x+2 <= endx;x+=2)
// widen two pixels of each input to 16-bit lanes
2969 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2970 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2971 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2972 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one pixel: same math on 32-bit loads/stores
2976 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2977 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2978 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2979 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// out4ub = (ina4ub * inb4ub) >> 8 per byte channel (SSE2), two pixels per iteration then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
2984 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2987 int x, startx = span->startx, endx = span->endx;
2988 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes as 0..255, pix2 holds them in the high byte (value*256)
2990 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2991 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of the 32-bit product: (a * b*256) >> 16
2992 pix1 = _mm_mulhi_epu16(pix1, pix2);
2993 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one pixel: same math on 32-bit loads/stores
2997 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2998 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2999 pix1 = _mm_mulhi_epu16(pix1, pix2);
3000 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// out4ub = ina4ub + inb4ub per byte channel, saturated to 255 when packed (SSE2).
// Two pixels per iteration, then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
3005 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3008 int x, startx = span->startx, endx = span->endx;
3009 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum cannot wrap before saturation
3011 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3012 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3013 pix1 = _mm_add_epi16(pix1, pix2);
3014 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one pixel: same math on 32-bit loads/stores
3018 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3019 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3020 pix1 = _mm_add_epi16(pix1, pix2);
3021 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// out4ub = ina4ub + inb4ub * inbtintbgra per byte channel, saturated when packed (SSE2).
// The tint is applied in fixed point with scale 256; inbtintbgra is used in the order given (no lane swap).
// Two pixels per iteration, then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
3026 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3029 int x, startx = span->startx, endx = span->endx;
// tint as 16-bit fixed point, the four channels repeated in both 64-bit halves (one copy per pixel)
3030 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3031 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3032 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes as 0..255, pix2 holds them in the high byte (value*256)
3034 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3035 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// (tint * b*256) >> 16 is the tinted b, added to a
3036 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3037 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one pixel: same math on 32-bit loads/stores
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3043 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3044 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb4ub over ina4ub using the fourth byte of each inb pixel as weight (SSE2):
//   out = a + ((b - a) * weight) >> 8, saturated when packed.
// Two pixels per iteration, then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
3049 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3052 int x, startx = span->startx, endx = span->endx;
3053 for (x = startx;x+2 <= endx;x+=2)
3055 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3056 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast 16-bit lane 3 of each pixel (its fourth channel) to all four lanes of that pixel
3057 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high multiply returns (difference * weight) >> 8
3058 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3059 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// one pixel: only the low four lanes carry data, so a single low shuffle is enough
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3065 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3066 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3067 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends every pixel of in4ub towards the constant `color`, weighted by the color's fourth component (SSE2):
//   out = in + ((color - in) * weight) >> 8, saturated when packed.
// Two pixels per iteration, then one trailing pixel.
// NOTE(review): the condition guarding the trailing-pixel code is not visible in this excerpt.
3072 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3075 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 integers with lanes 0 and 2 swapped
3076 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes and repeat the four channels in both 64-bit halves (one copy per pixel)
3077 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// weight = lane 3 of the color broadcast to every lane, pre-shifted by 4 for the high multiply below
3078 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3079 for (x = startx;x+2 <= endx;x+=2)
3081 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted by 4 as well, so the signed high multiply returns (difference * weight) >> 8
3082 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3083 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// one pixel: same math on 32-bit loads/stores
3087 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3088 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3089 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrixM1 uniform and DPSOFTRAST_Array_Load on COLOR and TEXCOORD0.
// TEXCOORD1 is only loaded when the SHADERPERMUTATION_SPECULAR bit is set.
3096 void DPSOFTRAST_VertexShader_Generic(void)
3098 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3099 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3100 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3101 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3102 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3105 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3107 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3108 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3109 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3110 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3111 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3112 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3114 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3115 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3116 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3118 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3119 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3122 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3124 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3127 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3129 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3132 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3137 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3138 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrixM1 uniform and DPSOFTRAST_Array_Load on TEXCOORD0 and TEXCOORD1.
3143 void DPSOFTRAST_VertexShader_PostProcess(void)
3145 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3146 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3147 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the post-process shader: samples texture GL20TU_FIRST (array 2) into the fragment buffer,
// optionally adds the bloom texture, blends towards the ViewTintColor uniform and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but not implemented.
3150 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3152 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3153 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3154 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3155 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3156 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3157 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3158 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom: sample GL20TU_SECOND with array 3 and add it after subtracting the BloomColorSubtract uniform
3160 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3161 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// the view tint blend reads and writes the fragment buffer in place
3163 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3164 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3166 // TODO: implement saturation
3168 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3170 // TODO: implement gammaramps
3172 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed, via
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3177 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3179 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: begins the span, zeroes the first span->length*4 bytes of the
// fragment buffer and passes it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3182 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3184 // this is never called (because colormask is off when this shader is used)
3185 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3186 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3187 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3188 memset(buffer_FragColorbgra8, 0, span->length*4);
3189 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat color shader: runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrixM1 uniform and DPSOFTRAST_Array_Transform on TEXCOORD0 with the TexMatrixM1 uniform.
3194 void DPSOFTRAST_VertexShader_FlatColor(void)
3196 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3197 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3200 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3202 int x, startx = span->startx, endx = span->endx;
3203 int Color_Ambienti[4];
3204 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3205 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207 Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3208 Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3209 Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3210 Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0] *256.0f);
3211 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3212 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3213 for (x = startx;x < endx;x++)
3215 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3216 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3217 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3218 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3220 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex color shader: runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrixM1 uniform, DPSOFTRAST_Array_Load on COLOR and DPSOFTRAST_Array_Transform on
// TEXCOORD0 with the TexMatrixM1 uniform.
3225 void DPSOFTRAST_VertexShader_VertexColor(void)
3227 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3228 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3229 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex color shader (SSE2):
//   fragment = texture(GL20TU_COLOR) * (Color_Diffuse * vertex color + Color_Ambient), alpha taken from the Alpha uniform.
// Pixels are written straight to the framebuffer unless alpha test or a non-opaque blend mode is active, in
// which case they go to a local buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// NOTE(review): the declarations of data, slope, mod2 and pix2, the braces, and whatever skips masked pixels or
// advances x after the 4-pixel path are not visible in this excerpt.
3232 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3235 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span inside color buffer 0, 4 bytes per pixel
3236 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3237 int x, startx = span->startx, endx = span->endx;
3238 __m128i Color_Ambientm, Color_Diffusem;
3240 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3241 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3242 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3243 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3244 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3245 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// anything but a plain opaque write is rendered to the local buffer and finished separately
3246 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3247 pixel = buffer_FragColorbgra8;
// ambient: scale 256, lanes 0 and 2 swapped, lane 3 replaced by the Alpha uniform (scale 255), packed to 16 bit
3248 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3249 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3250 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3251 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scale 4096, lanes 0 and 2 swapped, lane 3 cleared, packed to 16 bit
3252 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3253 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3254 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// presumably fills data/slope with the color attribute's base value and per-pixel slope - confirm against the macro
3255 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3256 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3257 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// move to the first pixel and pre-scale value and slope by 4096
3258 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3259 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3260 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3261 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3263 __m128i color, mod, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3264 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3267 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3268 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// vertex color for pixels x..x+3, each multiplied by its own buffer_z entry; data steps once between pixels
3269 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3270 data = _mm_add_ps(data, slope);
3271 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3272 data = _mm_add_ps(data, slope);
3273 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3274 data = _mm_add_ps(data, slope);
3275 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) * texel, using the high halves of the 16-bit products
3276 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3277 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3278 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3279 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
// pack to bytes with unsigned saturation and store four pixels
3280 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same math on one texel
3286 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3287 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3288 mod = _mm_packs_epi32(mod, mod);
3289 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3290 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the local buffer was used: let FinishBGRA8 deal with it
3292 if (pixel == buffer_FragColorbgra8)
3293 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrixM1 uniform, DPSOFTRAST_Array_Transform on TEXCOORD0 with the TexMatrixM1 uniform
// and DPSOFTRAST_Array_Load on TEXCOORD4.
3299 void DPSOFTRAST_VertexShader_Lightmap(void)
3301 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3302 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3303 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader (SSE2):
//   fragment = texture(GL20TU_COLOR) * (Color_Diffuse * lightmap + Color_Ambient), alpha taken from the Alpha uniform,
//   plus texture(GL20TU_GLOW) * Color_Glow when SHADERPERMUTATION_GLOW is set.
// Pixels are written straight to the framebuffer unless alpha test or a non-opaque blend mode is active, in
// which case they go to a local buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
// NOTE(review): the declaration of pix2, the braces, the `else` separating the glow loop from the plain loop and
// whatever skips masked pixels or advances x after a 4-pixel path are not visible in this excerpt.
3306 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3309 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span inside color buffer 0, 4 bytes per pixel
3310 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3311 int x, startx = span->startx, endx = span->endx;
3312 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3313 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3314 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3315 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3317 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3318 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, the lightmap uses TEXCOORD4
3319 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3320 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// anything but a plain opaque write is rendered to the local buffer and finished separately
3321 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3322 pixel = buffer_FragColorbgra8;
// ambient: scale 256, lanes 0 and 2 swapped, lane 3 replaced by the Alpha uniform (scale 255), packed to 16 bit
3323 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3324 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3325 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3326 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse: scale 256, lanes 0 and 2 swapped, lane 3 cleared, packed to 16 bit
3327 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3328 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3329 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3330 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare its color the same way as the diffuse color
3332 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3333 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3334 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3335 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path below
3336 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3337 for (x = startx;x < endx;x++)
3339 __m128i color, lightmap, glow, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3340 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3343 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3344 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3345 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, first for pixels 0-1 then for pixels 2-3
3346 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3347 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3348 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3349 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3350 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3351 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3352 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes the lit color term, the high half the glow term
3358 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3359 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3360 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3361 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
// add the high half (glow term) onto the low half
3362 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3363 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// loop without the glow term
3368 for (x = startx;x < endx;x++)
3370 __m128i color, lightmap, pix;
// 4-pixel path: taken when four pixels remain and all four mask bytes equal 1
3371 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3374 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3375 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * color, first for pixels 0-1 then for pixels 2-3
3376 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3377 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3378 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3379 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3380 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same math on one texel of each buffer
3386 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3387 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3388 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3389 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// the local buffer was used: let FinishBGRA8 deal with it
3392 if (pixel == buffer_FragColorbgra8)
3393 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the fake light shader: only the position array is processed, via
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3399 void DPSOFTRAST_VertexShader_FakeLight(void)
3401 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the fake light shader: no lighting is computed here; the first span->length*4 bytes of the
// fragment buffer are zeroed and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3404 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3407 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3408 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3410 memset(buffer_FragColorbgra8, 0, span->length*4);
3411 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light direction maps: delegates to DPSOFTRAST_VertexShader_Lightmap.
3416 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3418 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for model-space light direction maps: delegates to DPSOFTRAST_PixelShader_Lightmap with the same arguments.
3421 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3423 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light direction maps: delegates to DPSOFTRAST_VertexShader_Lightmap.
3429 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3431 DPSOFTRAST_VertexShader_Lightmap();
// Pixel stage for tangent-space light direction maps: delegates to DPSOFTRAST_PixelShader_Lightmap with the same arguments.
3434 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3436 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex shader for the LightDirection mode.
// For every vertex it re-expresses two vectors in the per-vertex basis
// (svector, tvector, normal) read from TEXCOORD1/TEXCOORD2/TEXCOORD3:
//   - the LightDir uniform               -> stored in TEXCOORD1 with w = 0
//   - (EyePosition uniform - position)   -> stored in TEXCOORD2 with w = 0
// Afterwards POSITION is run through the ModelViewProjection matrix.
3442 void DPSOFTRAST_VertexShader_LightDirection(void)
3445 int numvertices = dpsoftrast.numvertices;
3447 float LightVector[4];
3448 float EyePosition[4];
3449 float EyeVectorModelSpace[4];
// Copy the LightDir and EyePosition uniforms into locals.
// All four components are copied, but the loop below only reads components 0..2.
3455 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3456 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3457 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3458 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3459 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3460 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3461 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3462 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the vertex arrays: POSITION and TEXCOORD1..3 are loaded as-is,
// TEXCOORD0 is transformed by the TexMatrix uniform.
// NOTE(review): presumably these calls fill dpsoftrast.post_array4f[], which
// the loop below reads - confirm against DPSOFTRAST_Array_Load/Array_Transform.
3463 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3464 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3465 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3466 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3467 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3468 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and basis vectors (4 floats per vertex, xyz used).
// The basis is copied to locals first because TEXCOORD1 and TEXCOORD2 are
// overwritten with the results further down.
3470 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3471 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3472 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3473 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3474 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3475 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3476 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3477 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3478 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3479 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3480 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3481 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: dot products of LightDir with svector, tvector and normal.
3482 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3483 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3484 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3485 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3486 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3487 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3488 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, projected onto the same three basis vectors.
3489 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3490 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3491 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3492 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3493 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3494 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3495 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3496 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3497 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3498 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// POSITION is projected only after the loop, which reads POSITION to build the eye vector.
3500 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helper macros used by the pixel shaders.
// Min/Max: plain ternary selection; each argument is evaluated more than once,
// so do not pass expressions with side effects.
3503 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3504 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable operands.
3505 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length (dot of v with itself) and length (sqrt of the squared length).
3506 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3507 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro (continued with backslashes) that starts by computing the
// length of v into a float named len; v is used unparenthesized, so pass a
// plain array name.
// NOTE(review): presumably the remaining body scales v by 1/len when len is
// non-zero - confirm against the full macro definition.
3508 #define DPSOFTRAST_Vector3Normalize(v)\
3511 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader for the LightDirection mode: shades one span of a triangle.
//   thread   - per-thread state; supplies uniform4f[] and shader_permutation
//   triangle - triangle being rasterized (passed on to the span helpers)
//   span     - span to shade; pixels startx..endx-1 are processed
// Depending on shader_permutation one of three loops runs:
//   SPECULAR         : ambient + diffuse + specular lighting
//   DIFFUSE          : ambient + diffuse lighting
//   neither          : ambient only
// Each loop optionally adds the glow texture (GLOW permutation). The result is
// written as bytes to buffer_FragColorbgra8 and handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3522 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, indexed by pixel x (4 bytes per pixel for the texture/color buffers).
3524 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3525 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3526 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3527 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3528 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3529 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3530 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3531 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3532 int x, startx = span->startx, endx = span->endx;
3533 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3534 float LightVectordata[4];
3535 float LightVectorslope[4];
3536 float EyeVectordata[4];
3537 float EyeVectorslope[4];
3539 float diffusetex[4];
3541 float surfacenormal[4];
3542 float lightnormal[4];
3544 float specularnormal[4];
3547 float SpecularPower;
// Load the color uniforms with components 0 and 2 swapped (uniform[0] -> [2],
// uniform[2] -> [0]) so that they line up with the byte order of the
// bgra8-named buffers. The fourth component is forced to 0, except for
// Color_Ambient[3], which carries the Alpha uniform.
3549 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3550 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3551 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3552 Color_Glow[3] = 0.0f;
3553 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3554 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3555 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3556 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3557 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3558 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3559 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3560 Color_Pants[3] = 0.0f;
3561 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3562 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3563 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3564 Color_Shirt[3] = 0.0f;
// Start the span (fills buffer_z) and sample the color texture for every pixel.
// Pants/shirt and glow textures are sampled only when their permutation bit is set,
// so their buffers must not be read otherwise.
3565 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3566 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3567 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3569 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3570 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3572 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3574 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3576 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3578 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3579 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3580 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3581 Color_Diffuse[3] = 0.0f;
3582 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3583 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3584 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3585 LightColor[3] = 0.0f;
// Light vector interpolation terms come from TEXCOORD1, eye vector terms from TEXCOORD2;
// the per-pixel value is later evaluated as (data + slope*x) * z.
3586 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3587 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3588 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3589 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3590 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3591 Color_Specular[3] = 0.0f;
// SpecularPower is pre-divided by 255 because it is multiplied by the raw
// 0..255 gloss alpha byte when used as the pow() exponent below.
3592 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3593 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3594 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3595 for (x = startx;x < endx;x++)
// Texel values stay in the raw 0..255 byte range; color uniforms act as scale factors.
3598 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3599 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3600 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3601 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add tinted pants and shirt layers to the base color.
// Color_Pants[3] and Color_Shirt[3] are 0, so alpha is left unchanged.
3602 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3604 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3605 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3606 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3607 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3609 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3610 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3611 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3612 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte/128 - 1 maps 0..255 to roughly -1..+1.
// Byte 2 becomes component 0 and byte 0 becomes component 2 (channel order reversed).
3613 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3614 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3615 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3616 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Interpolated light vector at this pixel, then normalized.
// NOTE(review): z is not assigned on any line visible here; presumably it is
// taken from buffer_z[x] at the top of the loop - confirm.
3618 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3619 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3620 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3621 DPSOFTRAST_Vector3Normalize(lightnormal);
// Interpolated eye vector at this pixel, then normalized.
3623 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3624 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3625 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3626 DPSOFTRAST_Vector3Normalize(eyenormal);
// Half vector: normalized sum of the light and eye directions.
3628 specularnormal[0] = lightnormal[0] + eyenormal[0];
3629 specularnormal[1] = lightnormal[1] + eyenormal[1];
3630 specularnormal[2] = lightnormal[2] + eyenormal[2];
3631 DPSOFTRAST_Vector3Normalize(specularnormal);
// Lighting terms, clamped at 0 from below; the specular term is raised to
// SpecularPower scaled by the gloss texture's alpha byte.
3633 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3634 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3635 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine: [glow +] ambient + (diffuse + specular) * LightColor per color channel;
// alpha is the texture alpha times Color_Ambient[3]. Results are clamped to 255
// from above only.
3636 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3638 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3639 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3640 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3641 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term (presumably the else branch of the GLOW test - confirm).
3645 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3646 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3647 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3648 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3650 buffer_FragColorbgra8[x*4+0] = d[0];
3651 buffer_FragColorbgra8[x*4+1] = d[1];
3652 buffer_FragColorbgra8[x*4+2] = d[2];
3653 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, no specular) ----
// NOTE(review): unlike the specular path, this loop does not add the
// pants/shirt colormapping layers to diffusetex - confirm this is intended.
3656 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3658 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3659 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3660 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3661 Color_Diffuse[3] = 0.0f;
3662 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3663 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3664 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3665 LightColor[3] = 0.0f;
3666 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3667 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3668 for (x = startx;x < endx;x++)
3671 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3672 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3673 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3674 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal map texel (same mapping as in the specular path).
3675 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3676 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3677 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3678 DPSOFTRAST_Vector3Normalize(surfacenormal);
3680 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3681 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3682 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3683 DPSOFTRAST_Vector3Normalize(lightnormal);
3685 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Combine: [glow +] texture * (ambient + diffuse * LightColor); upper clamp at 255 only.
3686 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3688 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3689 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3690 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3691 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// Same formula without the glow term (presumably the else branch of the GLOW test - confirm).
3695 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3696 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3697 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3698 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3700 buffer_FragColorbgra8[x*4+0] = d[0];
3701 buffer_FragColorbgra8[x*4+1] = d[1];
3702 buffer_FragColorbgra8[x*4+2] = d[2];
3703 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (presumably reached when neither SPECULAR nor DIFFUSE is set - confirm) ----
// No normal map or light vector is used here; colormapping is not applied either.
3708 for (x = startx;x < endx;x++)
3711 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3712 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3713 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3714 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3716 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3718 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3719 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3720 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3721 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same formula without the glow term (presumably the else branch of the GLOW test - confirm).
3725 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3726 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3727 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3728 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3730 buffer_FragColorbgra8[x*4+0] = d[0];
3731 buffer_FragColorbgra8[x*4+1] = d[1];
3732 buffer_FragColorbgra8[x*4+2] = d[2];
3733 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span over for final output.
3736 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
// For every vertex it re-expresses two offsets in the per-vertex basis
// (svector, tvector, normal) read from TEXCOORD1/TEXCOORD2/TEXCOORD3:
//   - (LightPosition uniform - position) -> stored in TEXCOORD1 with w = 0
//   - (EyePosition uniform - position)   -> stored in TEXCOORD2 with w = 0
// Afterwards POSITION is projected with the ModelViewProjection matrix and
// TEXCOORD3 is produced from POSITION using the ModelToLight matrix.
3741 void DPSOFTRAST_VertexShader_LightSource(void)
3744 int numvertices = dpsoftrast.numvertices;
3745 float LightPosition[4];
3746 float LightVector[4];
3747 float LightVectorModelSpace[4];
3748 float EyePosition[4];
3749 float EyeVectorModelSpace[4];
// Copy the LightPosition and EyePosition uniforms into locals.
// All four components are copied, but the loop below only reads components 0..2.
3755 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3756 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3757 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3758 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3759 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3760 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3761 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3762 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the vertex arrays: POSITION and TEXCOORD1..4 are loaded as-is,
// TEXCOORD0 is transformed by the TexMatrix uniform.
// NOTE(review): presumably these calls fill dpsoftrast.post_array4f[], which
// the loop below reads - confirm against DPSOFTRAST_Array_Load/Array_Transform.
3763 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3764 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3765 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3766 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3767 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3768 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3769 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and basis vectors (4 floats per vertex, xyz used).
// The basis is copied to locals first because TEXCOORD1 and TEXCOORD2 are
// overwritten with the results further down.
3771 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3772 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3773 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3774 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3775 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3776 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3777 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3778 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3779 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3780 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3781 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3782 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light vector: vertex-to-light offset, projected onto svector, tvector and normal.
3783 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3784 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3785 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3786 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3787 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3788 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3789 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3790 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3791 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3792 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Eye vector: vertex-to-eye offset, projected onto the same three basis vectors.
3793 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3794 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3795 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3796 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3797 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3798 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3799 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3800 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3801 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3802 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// POSITION is projected only after the loop, which reads POSITION to build both vectors.
3804 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (the normal until now) is replaced by POSITION transformed with the ModelToLight matrix.
// NOTE(review): this runs after the projection call above; whether Array_Transform
// reads the original or the already-projected positions depends on its implementation - confirm.
3805 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3808 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3811 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3812 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3813 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3814 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3815 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3816 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3817 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3818 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3819 int x, startx = span->startx, endx = span->endx;
3820 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3821 float CubeVectordata[4];
3822 float CubeVectorslope[4];
3823 float LightVectordata[4];
3824 float LightVectorslope[4];
3825 float EyeVectordata[4];
3826 float EyeVectorslope[4];
3828 float diffusetex[4];
3830 float surfacenormal[4];
3831 float lightnormal[4];
3833 float specularnormal[4];
3836 float SpecularPower;
3837 float CubeVector[4];
3840 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3841 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3842 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3843 Color_Glow[3] = 0.0f;
3844 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3845 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3846 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3847 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3848 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3849 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3850 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3851 Color_Diffuse[3] = 0.0f;
3852 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3853 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3854 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3855 Color_Specular[3] = 0.0f;
3856 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3857 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3858 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3859 Color_Pants[3] = 0.0f;
3860 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3861 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3862 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3863 Color_Shirt[3] = 0.0f;
3864 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3865 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3866 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3867 LightColor[3] = 0.0f;
3868 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3869 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3870 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3871 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3872 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3873 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3874 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3875 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3877 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3878 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3880 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3881 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3882 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3884 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3885 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3886 for (x = startx;x < endx;x++)
3889 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3890 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3891 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3892 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3893 if (attenuation < 0.01f)
3895 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3897 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3898 if (attenuation < 0.01f)
3902 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3903 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3904 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3905 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3906 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3908 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3909 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3910 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3911 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3913 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3914 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3915 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3916 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3917 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3918 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3919 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3920 DPSOFTRAST_Vector3Normalize(surfacenormal);
3922 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3923 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3924 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3925 DPSOFTRAST_Vector3Normalize(lightnormal);
3927 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3928 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3929 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3930 DPSOFTRAST_Vector3Normalize(eyenormal);
3932 specularnormal[0] = lightnormal[0] + eyenormal[0];
3933 specularnormal[1] = lightnormal[1] + eyenormal[1];
3934 specularnormal[2] = lightnormal[2] + eyenormal[2];
3935 DPSOFTRAST_Vector3Normalize(specularnormal);
3937 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3938 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3939 specular = pow(specular, SpecularPower * glosstex[3]);
3940 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3942 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3943 attenuation *= (1.0f / 255.0f);
3944 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3945 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3946 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3947 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3951 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3952 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3953 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3954 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3956 buffer_FragColorbgra8[x*4+0] = d[0];
3957 buffer_FragColorbgra8[x*4+1] = d[1];
3958 buffer_FragColorbgra8[x*4+2] = d[2];
3959 buffer_FragColorbgra8[x*4+3] = d[3];
3962 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3964 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3965 for (x = startx;x < endx;x++)
3968 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3969 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3970 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3971 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3972 if (attenuation < 0.01f)
3974 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3976 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3977 if (attenuation < 0.01f)
3981 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3982 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3983 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3984 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3985 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3987 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3988 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3989 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3990 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3992 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3993 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3994 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3995 DPSOFTRAST_Vector3Normalize(surfacenormal);
3997 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3998 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3999 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4000 DPSOFTRAST_Vector3Normalize(lightnormal);
4002 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4003 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4005 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4006 attenuation *= (1.0f / 255.0f);
4007 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4008 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4009 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4010 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4014 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4015 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4016 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4017 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4019 buffer_FragColorbgra8[x*4+0] = d[0];
4020 buffer_FragColorbgra8[x*4+1] = d[1];
4021 buffer_FragColorbgra8[x*4+2] = d[2];
4022 buffer_FragColorbgra8[x*4+3] = d[3];
4027 for (x = startx;x < endx;x++)
4030 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4031 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4032 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4033 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4034 if (attenuation < 0.01f)
4036 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4038 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4039 if (attenuation < 0.01f)
4043 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4044 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4045 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4046 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4047 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4049 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4050 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4051 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4052 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4054 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4056 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4057 attenuation *= (1.0f / 255.0f);
4058 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4059 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4060 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4061 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4065 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4066 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4067 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4068 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4070 buffer_FragColorbgra8[x*4+0] = d[0];
4071 buffer_FragColorbgra8[x*4+1] = d[1];
4072 buffer_FragColorbgra8[x*4+2] = d[2];
4073 buffer_FragColorbgra8[x*4+3] = d[3];
4076 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4082 void DPSOFTRAST_VertexShader_Refraction(void)
4084 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4087 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4090 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4091 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4092 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4093 memset(buffer_FragColorbgra8, 0, span->length*4);
4094 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4099 void DPSOFTRAST_VertexShader_Water(void)
4101 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4105 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4108 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4109 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4111 memset(buffer_FragColorbgra8, 0, span->length*4);
4112 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4117 void DPSOFTRAST_VertexShader_ShowDepth(void)
4119 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4122 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4125 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4126 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4128 memset(buffer_FragColorbgra8, 0, span->length*4);
4129 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4134 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4136 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4139 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4142 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4143 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4145 memset(buffer_FragColorbgra8, 0, span->length*4);
4146 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4151 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4153 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4156 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4159 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4160 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4162 memset(buffer_FragColorbgra8, 0, span->length*4);
4163 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the callbacks that implement it and the inputs
// it consumes. Instances live in DPSOFTRAST_ShaderModeTable, indexed by
// shader_mode.
// NOTE(review): this listing elides at least one member line between the
// struct header and Vertex; the draw code reads a `lodarrayindex` member and
// every table row starts with an extra leading value - confirm against the
// full file.
4168 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback, invoked once per draw call by DPSOFTRAST_DrawTriangles
4171 void (*Vertex)(void);
// span (pixel-stage) callback, invoked per span by DPSOFTRAST_Draw_ProcessSpans
4172 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* ids the mode interpolates; readers stop at the first
// value >= DPSOFTRAST_ARRAY_TOTAL (the table uses ~0 as that terminator)
4173 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit ids considered for mip selection; readers stop at the first
// value >= DPSOFTRAST_MAXTEXTUREUNITS (the table uses ~0 as that terminator)
4174 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4176 DPSOFTRAST_ShaderModeInfo;
4178 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4180 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4181 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4182 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4183 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4184 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4185 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4186 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4187 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4188 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4189 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4190 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4191 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4192 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4193 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4194 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4195 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Processes every span queued on this thread (thread->spans[0..numspans)):
// builds a per-pixel mask from the depth test, runs the shader mode's Span
// callback on the surviving range, then resets thread->numspans to 0.
// NOTE(review): this listing elides short lines (braces, several local
// declarations, the else/default lines and the depth-write loop body), so the
// comments below only describe what the visible lines establish.
4198 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4205 // unsigned int *colorpixel;
4206 unsigned int *depthpixel;
4212 DPSOFTRAST_State_Triangle *triangle;
4213 DPSOFTRAST_State_Span *span;
// one mask byte per pixel of the current span; lives on this function's stack,
// so span->pixelmask is only valid while the Span callback below is running
4214 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4215 for (i = 0; i < thread->numspans; i++)
4217 span = &thread->spans[i];
4218 triangle = &thread->triangles[span->triangle];
// depth-tested path: only taken when a depth buffer exists
4219 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds a plane equation: w[0] = slope per x, w[1] = slope per y,
// w[2] = value at the origin; evaluate it at the span's first pixel
4221 wslope = triangle->w[0];
4222 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to integer depth units; the start value also has the polygon
// offset (constant term plus slope-scaled term) subtracted
4223 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4224 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4225 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
// fill pixelmask[x] with the result of comparing the stored depth against
// the interpolated depth d, according to the GL depth function
4226 switch(thread->fb_depthfunc)
4229 case GL_ALWAYS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4230 case GL_LESS: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4231 case GL_LEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4232 case GL_EQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4233 case GL_GEQUAL: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4234 case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4235 case GL_NEVER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4237 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4238 //for (x = 0;x < span->length;x++)
4239 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4240 // if there is no color buffer, skip pixel shader
// shrink [startx, endx) from both ends past pixels that failed the depth test
// (the loop bodies and the startx initialisation are elided in this listing)
4242 endx = span->length;
4243 while (startx < endx && !pixelmask[startx])
4245 while (endx > startx && !pixelmask[endx-1])
4248 continue; // no pixels to fill
4249 span->pixelmask = pixelmask;
4250 span->startx = startx;
4252 // run pixel shader if appropriate
4253 // do this before running depthmask code, to allow the pixelshader
4254 // to clear pixelmask values for alpha testing
4255 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4256 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// when depth writes are enabled, walk the surviving range with the
// interpolated depth (loop body elided in this listing)
4257 if (thread->depthmask)
4258 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4264 // no depth testing means we're just dealing with color...
4265 // if there is no color buffer, skip pixel shader
4266 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mask is all ones and the range is the full span
4268 memset(pixelmask, 1, span->length);
4269 span->pixelmask = pixelmask;
4271 span->endx = span->length;
4272 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4276 thread->numspans = 0;
// Declares the Draw command via DEFCOMMAND (the macro is defined outside this
// chunk; the code below uses the resulting names DPSOFTRAST_Command_Draw and
// DPSOFTRAST_OPCODE_Draw, and 22 is presumably the opcode value - confirm).
// Fields: starty/endy are the clamped scanline range of the draw; refcount
// counts the threads that still have to interpret the command; arrays points
// at the screen-coordinate block followed by the post-transform vertex
// arrays; element3i/element3s are the 32-bit/16-bit triangle index lists.
4279 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on a worker thread. For every triangle of the
// command it: fetches the three indices, clips against the near plane
// (producing 3 or 4 projected points), computes the integer screen bounds,
// sets up the plane equations for 1/w and for every vertex array the shader
// mode uses, picks a mip level per texture unit, and finally walks the
// scanlines that fall inside this thread's two y-bands, queueing spans in
// thread->spans. Queued work is flushed through DPSOFTRAST_Draw_ProcessSpans
// whenever the span or triangle buffers fill up and once more at the end.
// The command's refcount is decremented when the thread is done with it.
// NOTE(review): this listing elides short lines (braces, several local
// declarations, switch headers, else lines, early returns/continues and
// parts of the macros), so the comments below describe only what the visible
// lines establish.
4281 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4284 int cullface = thread->cullface;
4285 int width = dpsoftrast.fb_width;
// the thread owns two scanline bands: [miny1, maxy1) and [miny2, maxy2)
4286 int miny1 = thread->miny1;
4287 int maxy1 = thread->maxy1;
4288 int miny2 = thread->miny2;
4289 int maxy2 = thread->maxy2;
4290 __m128i fbmin, fbmax;
4291 __m128 viewportcenter, viewportscale;
4292 int firstvertex = command->firstvertex;
4293 int numvertices = command->numvertices;
4294 int numtriangles = command->numtriangles;
4295 const int *element3i = command->element3i;
4296 const unsigned short *element3s = command->element3s;
4297 int clipped = command->clipped;
4304 int starty, endy, bandy;
4308 __m128 triangleedge1, triangleedge2, trianglenormal;
4311 DPSOFTRAST_State_Triangle *triangle;
4312 DPSOFTRAST_Texture *texture;
// the draw's scanline range touches neither of this thread's bands: just drop
// our reference; the thread that brings the count to zero frees
// command->arrays when the command is header-sized, i.e. when the vertex
// data was allocated separately rather than inline in the command pool
4313 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4315 if (!ATOMIC_DECREMENT(command->refcount))
4317 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4318 MM_FREE(command->arrays);
4322 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clip rectangle as packed 16-bit (x, y) pairs, one pair per point slot:
// fbmin = (0, miny1), fbmax = (width-1, maxy2-1), i.e. fbmax is inclusive
4323 fbmin = _mm_setr_epi16(0, miny1, 0, miny1, 0, miny1, 0, miny1);
4324 fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy2, width, maxy2, width, maxy2, width, maxy2), _mm_set1_epi16(1));
4325 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4326 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4327 screen[3] = _mm_setzero_ps();
4328 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4329 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices float4 screen coordinates; the
// block right after it (what `arrays` points at here) holds the values the
// near-plane test below reads
4331 const float *screencoord4f = command->arrays;
4332 const float *arrays = screencoord4f + numvertices*4;
4334 // generate the 3 edges of this triangle
4335 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices come from the 16-bit or the 32-bit index list (the selecting
// condition is elided in this listing) and are rebased to firstvertex
4338 e[0] = element3s[i*3+0] - firstvertex;
4339 e[1] = element3s[i*3+1] - firstvertex;
4340 e[2] = element3s[i*3+2] - firstvertex;
4344 e[0] = element3i[i*3+0] - firstvertex;
4345 e[1] = element3i[i*3+1] - firstvertex;
4346 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below (parts of their bodies are
// elided in this listing):
//  SKIPBACKFACE      - builds two screen-space edges from screen[1], takes the
//                      scalar cross-product term of them as trianglenormal and
//                      compares its sign against zero.
//  CLIPPEDVERTEXLERP - clipfrac[p1] = clipdist[p1] / (clipdist[p1] - clipdist[p2])
//                      is the fraction along edge p1->p2 where clipdist reaches
//                      zero; the interpolated vertex is projected into screen[k].
//  CLIPPEDVERTEXCOPY - loads the already projected screen coordinate of vertex
//                      p1 into screen[k].
4355 #define SKIPBACKFACE \
4356 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4357 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4358 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4359 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4360 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4364 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4368 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4373 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4374 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4376 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4377 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4379 #define CLIPPEDVERTEXCOPY(k,p1) \
4380 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIB* fetch one vertex attribute per output point: COPY takes vertex
// p1 as is, LERP interpolates p1->p2 with the clipfrac computed during
// clipping; GENATTRIBS selects the combination per clip case (the switch
// header and some cases are elided in this listing)
4382 #define GENATTRIBCOPY(attrib, p1) \
4383 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4384 #define GENATTRIBLERP(attrib, p1, p2) \
4386 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4387 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4389 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4393 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4394 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4395 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4396 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4397 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4398 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4399 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4405 // calculate distance from nearplane
4406 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4407 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4408 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over the signs of the three distances: vertices with
// clipdist >= 0 are kept, edges crossing zero get an interpolated vertex,
// which yields either 3 or 4 output points in screen[]
4409 if (clipdist[0] >= 0.0f)
4411 if (clipdist[1] >= 0.0f)
4413 if (clipdist[2] >= 0.0f)
4416 // triangle is entirely in front of nearplane
4417 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4424 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4432 if (clipdist[2] >= 0.0f)
4434 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4441 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4448 else if (clipdist[1] >= 0.0f)
4450 if (clipdist[2] >= 0.0f)
4452 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4459 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4465 else if (clipdist[2] >= 0.0f)
4467 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4472 else continue; // triangle is entirely behind nearplane
4475 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of each point as eight signed 16-bit
// values; for a 3-point polygon the fourth slot duplicates point 2. The
// shuffles then reduce to the min/max (x, y) over all points, which are
// clamped to the fbmin/fbmax rectangle.
4476 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4477 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4478 screenmin = _mm_min_epi16(screeni, screenir),
4479 screenmax = _mm_max_epi16(screeni, screenir);
4480 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4481 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4482 screenmin = _mm_max_epi16(screenmin, fbmin);
4483 screenmax = _mm_min_epi16(screenmax, fbmax);
4484 // skip offscreen triangles
4485 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// lane 1 is the y component: rows covered are [starty, endy)
4487 starty = _mm_extract_epi16(screenmin, 1);
4488 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two bands
4489 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift leaves each point's y, sign-extended, in its 32-bit lane
4491 screeny = _mm_srai_epi32(screeni, 16);
4494 triangle = &thread->triangles[thread->numtriangles];
4496 // calculate attribute plans for triangle data...
4497 // okay, this triangle is going to produce spans, we'd better project
4498 // the interpolants now (this is what gives perspective texturing),
4499 // this consists of simply multiplying all arrays by the W coord
4500 // (which is basically 1/Z), which will be undone per-pixel
4501 // (multiplying by Z again) to get the perspective-correct array
4504 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4505 __m128 mipedgescale, mipdensity;
// the two screen edges divided by the triangle normal term give the factors
// that turn per-edge attribute differences into per-pixel x/y slopes
4506 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4507 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4508 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4509 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4510 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast the w component of each of the first three points
4511 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4512 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4513 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w itself, anchored at point 1:
// triangle->w = { slope per x, slope per y, value at the screen origin }
4514 attribedge1 = _mm_sub_ss(w0, w1);
4515 attribedge2 = _mm_sub_ss(w2, w1);
4516 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4517 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4518 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4519 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4520 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4521 _mm_store_ss(&triangle->w[0], attribxslope);
4522 _mm_store_ss(&triangle->w[1], attribyslope);
4523 _mm_store_ss(&triangle->w[2], attriborigin);
4524 mipedgescale = _mm_setzero_ps();
// same plane setup for each vertex array listed by the shader mode; the list
// ends at the first id >= DPSOFTRAST_ARRAY_TOTAL (the statement taken on that
// condition is elided in this listing)
4525 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4527 __m128 attrib0, attrib1, attrib2;
4528 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4529 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next numvertices*float4 block of the command's vertex data
4531 arrays += numvertices*4;
4532 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are pre-multiplied by w before the slopes are taken
4533 attriborigin = _mm_mul_ps(attrib1, w1);
4534 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4535 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4536 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4537 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4538 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4539 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4540 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4541 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array used for level-of-detail: attribute change along each of the
// two edges, scaled by the reciprocal screen-space length of that edge
4542 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4544 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4545 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4546 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4547 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit listed by the shader mode; the list
// ends at the first id >= DPSOFTRAST_MAXTEXTUREUNITS (the statement taken on
// that condition is elided in this listing)
4551 memset(triangle->mip, 0, sizeof(triangle->mip));
4552 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4554 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4555 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4557 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
// get a mip level computed
4558 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints loaded from texture->mipmap[0][2..3], square, sum per
// edge, and keep the smaller of the two edge densities
4560 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4561 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4562 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4563 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4564 // this will be multiplied in the texturing routine by the texture resolution
4565 y = _mm_cvtss_si32(mipdensity);
// the density is a squared quantity, hence half of log2(y); clamp to the
// texture's last mip level
4568 y = (int)(log((float)y)*0.5f/M_LN2);
4569 if (y > texture->mipmaps - 1)
4570 y = texture->mipmaps - 1;
4571 triangle->mip[texunit] = y;
// scanline walk: first pass covers rows up to maxy1 (band 1); the loop step
// then switches the limit to maxy2 and skips y forward to miny2 (band 2)
4577 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4580 __m128 xcoords, xslope;
// per point: all-ones lane when y > that point's y. movemask turns this into
// one hex digit (F or 0) per point, which the case tables below map to the
// two active edges (edge0 / edge1, each going from point p to point n).
// The first table handles 4-point polygons, the second 3-point ones (where
// the duplicated fourth slot always mirrors point 2); the switch headers
// themselves are elided in this listing.
4581 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4582 int yccmask = _mm_movemask_epi8(ycc);
4583 int edge0p, edge0n, edge1p, edge1n;
4590 case 0xFFFF: /*0000*/ y = endy; continue;
4591 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4592 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4593 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4594 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4595 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4596 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4597 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4598 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4599 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4600 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4601 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4602 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4603 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4604 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4605 case 0x0000: /*1111*/ y++; continue;
4613 case 0xFFFF: /*000*/ y = endy; continue;
4614 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4615 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4616 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4617 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4618 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4619 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4620 case 0x0000: /*111*/ y++; continue;
// points already passed (mask set) are replaced by 0x7FFF so they lose the
// following min-reduction; lane 0 then holds the smallest remaining point y,
// i.e. the last row before the edge selection changes, limited to the band
4623 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4624 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4625 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4626 nexty = _mm_extract_epi16(ycc, 0);
4627 if (nexty >= bandy) nexty = bandy-1;
// compares the largest x on edge0 against the smallest x on edge1 (the
// statements controlled by this test are elided in this listing)
4628 if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
// xslope = dx/dy for both edges (edge0 in the low half, edge1 in the high
// half); xcoords = x of both edges at row y, plus 0.5
4637 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4638 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4639 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4640 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4641 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// one iteration per scanline, stepping both edge x values by their slope
4642 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4644 int startx, endx, offset;
4645 startx = _mm_cvtss_si32(xcoords);
4646 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row's pixel range to [0, fb_width) and skip empty rows
4647 if (startx < 0) startx = 0;
4648 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4649 if (startx >= endx) continue;
// split the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
// (the assignments of the span's position are elided in this listing);
// flush the queue as soon as it holds DPSOFTRAST_DRAW_MAXSPANS spans
4650 for (offset = startx; offset < endx;)
4652 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4653 span->triangle = thread->numtriangles;
4656 span->length = endx - offset;
4657 if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4658 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4659 offset += span->length;
4660 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4661 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: process what is queued and start refilling from slot 0
4666 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4668 DPSOFTRAST_Draw_ProcessSpans(thread);
4669 thread->numtriangles = 0;
// done with the command: drop our reference; the thread that reaches zero
// frees separately allocated vertex data (same rule as the early-out above)
4673 if (!ATOMIC_DECREMENT(command->refcount))
4675 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4676 MM_FREE(command->arrays);
// flush whatever is still queued for this draw
4679 if (thread->numspans > 0 || thread->numtriangles > 0)
4681 DPSOFTRAST_Draw_ProcessSpans(thread);
4682 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its per-vertex data and
// index list, and points the global dpsoftrast.screencoord4f /
// dpsoftrast.post_array4f[] at that storage so the vertex stage can fill it.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists; one of them is
//                              copied into the command (the selecting
//                              conditions are elided in this listing)
// NOTE(review): this listing elides short lines (braces, local declarations,
// the if/else lines around the index handling and presumably the return of
// `command`), so the comments below describe only what the visible lines
// establish.
4687 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4691 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 blocks per vertex are always needed: screen coordinates and the
// post-transform position array (see the pointer setup further down)
// NOTE(review): the size arithmetic is done in int; very large vertex or
// triangle counts could overflow - confirm callers bound these values
4692 int datasize = 2*numvertices*sizeof(float[4]);
4693 DPSOFTRAST_Command_Draw *command;
4694 unsigned char *data;
// plus one float4 block per vertex for every array the current shader mode
// lists before its terminator (value >= DPSOFTRAST_ARRAY_TOTAL)
4695 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4697 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4698 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4700 datasize += numvertices*sizeof(float[4]);
// plus the index list, sized for 16-bit or 32-bit indices
4703 datasize += numtriangles*sizeof(unsigned short[3]);
4705 datasize += numtriangles*sizeof(int[3]);
4706 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one pooled command: allocate only the header in the command
// pool and put the data in a separate zeroed MM_CALLOC block. The consumers
// recognise this case by commandsize being header-sized and MM_FREE the block.
// NOTE(review): the MM_CALLOC result is not checked in the visible lines;
// a failed allocation would be written through below - confirm
4707 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4709 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4710 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives inline, right after the command header
4714 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4715 data = (unsigned char *)command + commandsize;
4717 command->firstvertex = firstvertex;
4718 command->numvertices = numvertices;
4719 command->numtriangles = numtriangles;
4720 command->arrays = (float *)data;
// carve the data block up: screen coordinates first, then the position
// array, then one block per array used by the shader mode; unused
// post_array4f entries stay NULL from the memset
4721 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4722 dpsoftrast.firstvertex = firstvertex;
4723 dpsoftrast.numvertices = numvertices;
4724 dpsoftrast.screencoord4f = (float *)data;
4725 data += numvertices*sizeof(float[4]);
4726 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4727 data += numvertices*sizeof(float[4]);
4728 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4730 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4731 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4733 dpsoftrast.post_array4f[j] = (float *)data;
4734 data += numvertices*sizeof(float[4]);
// the index list is copied to the end of the data block so the command owns
// its own copy; only one of the two pointers ends up non-NULL
4736 command->element3i = NULL;
4737 command->element3s = NULL;
4740 command->element3s = (unsigned short *)data;
4741 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4745 command->element3i = (int *)data;
4746 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command for the given vertex
// range and index list, runs the current shader mode's vertex stage, records
// the scanline range the draw covers and hands the command to the worker
// threads.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists
// NOTE(review): this listing elides short lines (braces, a return after the
// undo, local declarations and probably preprocessor conditionals around the
// thread-signalling and flush calls), so the comments below describe only
// what the visible lines establish.
4751 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4753 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
// the vertex stage runs here, on the calling thread, before the command is
// made visible to the workers
4754 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the draw's scanline range to the framebuffer height
4755 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4756 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing to rasterize: release separately allocated vertex data (only
// present when the command is header-sized) and take the command back
4757 if (command->starty >= command->endy)
4759 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4760 MM_FREE(command->arrays);
4761 DPSOFTRAST_UndoCommand(command->commandsize);
4764 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread drops its reference when it has
// interpreted (or skipped) the command
4765 command->refcount = dpsoftrast.numthreads;
4768 DPSOFTRAST_Draw_SyncCommands();
// wake every starving thread whose bands overlap the draw's scanline range
4771 for (i = 0; i < dpsoftrast.numthreads; i++)
4773 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4774 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4775 SDL_CondSignal(thread->drawcond);
4779 DPSOFTRAST_Draw_FlushThreads();
// Executes the queued commands for one thread, starting at the thread's own
// commandoffset and stopping once that offset reaches endoffset.
//   thread    - per-thread state passed to every DPSOFTRAST_Interpret_* handler
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
4783 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4785 int commandoffset = thread->commandoffset;
4786 while (commandoffset != endoffset)
4788 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4789 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: it calls
// DPSOFTRAST_Interpret_<name>, advances by the aligned size of
// DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
4791 #define INTERPCOMMAND(name) \
4792 case DPSOFTRAST_OPCODE_##name : \
4793 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4794 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4795 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4796 commandoffset = 0; \
4798 INTERPCOMMAND(Viewport)
4799 INTERPCOMMAND(ClearColor)
4800 INTERPCOMMAND(ClearDepth)
4801 INTERPCOMMAND(ColorMask)
4802 INTERPCOMMAND(DepthTest)
4803 INTERPCOMMAND(ScissorTest)
4804 INTERPCOMMAND(Scissor)
4805 INTERPCOMMAND(BlendFunc)
4806 INTERPCOMMAND(BlendSubtract)
4807 INTERPCOMMAND(DepthMask)
4808 INTERPCOMMAND(DepthFunc)
4809 INTERPCOMMAND(DepthRange)
4810 INTERPCOMMAND(PolygonOffset)
4811 INTERPCOMMAND(CullFace)
4812 INTERPCOMMAND(AlphaTest)
4813 INTERPCOMMAND(AlphaFunc)
4814 INTERPCOMMAND(SetTexture)
4815 INTERPCOMMAND(SetShader)
4816 INTERPCOMMAND(Uniform4f)
4817 INTERPCOMMAND(UniformMatrix4f)
4818 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the advance uses the size recorded in
// the command itself instead of sizeof.
4820 case DPSOFTRAST_OPCODE_Draw:
4821 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4822 commandoffset += command->commandsize;
4823 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// The thread's offset is stored right after a draw, not only at loop exit.
4825 thread->commandoffset = commandoffset;
// NOTE(review): presumably a Reset command sends the offset back to the start
// of the pool - confirm against the case body.
4828 case DPSOFTRAST_OPCODE_Reset:
// Record how far this thread has progressed.
4833 thread->commandoffset = commandoffset;
// Thread entry point (passed to SDL_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread owned by this thread
// Keeps running while thread->index is non-negative: interprets any commands
// between its own offset and dpsoftrast.drawcommand, then sleeps on drawcond
// when it has caught up.
4837 static int DPSOFTRAST_Draw_Thread(void *data)
4839 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4840 while(thread->index >= 0)
// Work is pending whenever this thread's offset differs from the global one.
4842 if (thread->commandoffset != dpsoftrast.drawcommand)
4844 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4848 SDL_LockMutex(thread->drawmutex);
// Re-check under the mutex before sleeping, so the thread only waits when it
// is caught up and still has a non-negative index.
4849 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// Release a flusher that is blocked on waitcond for this thread.
4851 if (thread->waiting) SDL_CondSignal(thread->waitcond);
// starving is set for the duration of the wait; the signalling code checks it
// before signalling drawcond.
4852 thread->starving = true;
4853 SDL_CondWait(thread->drawcond, thread->drawmutex);
4854 thread->starving = false;
4856 SDL_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the command
// pool as empty. Takes no parameters and returns nothing.
4863 static void DPSOFTRAST_Draw_FlushThreads(void)
4865 DPSOFTRAST_State_Thread *thread;
4867 DPSOFTRAST_Draw_SyncCommands();
// Pass 1: signal each thread that is behind and currently flagged as starving.
4869 for (i = 0; i < dpsoftrast.numthreads; i++)
4871 thread = &dpsoftrast.threads[i];
// Unlocked pre-check; the condition is tested again once the mutex is held.
4872 if (thread->commandoffset != dpsoftrast.drawcommand)
4874 SDL_LockMutex(thread->drawmutex);
4875 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4876 SDL_CondSignal(thread->drawcond);
4877 SDL_UnlockMutex(thread->drawmutex);
// Pass 2: for each thread that is still behind, block on its waitcond.
4881 for (i = 0; i < dpsoftrast.numthreads; i++)
4883 thread = &dpsoftrast.threads[i];
4885 if (thread->commandoffset != dpsoftrast.drawcommand)
4887 SDL_LockMutex(thread->drawmutex);
4888 if (thread->commandoffset != dpsoftrast.drawcommand)
// waiting is set for the duration of the wait; DPSOFTRAST_Draw_Thread checks
// it before signalling waitcond.
4890 thread->waiting = true;
4891 SDL_CondWait(thread->waitcond, thread->drawmutex);
4892 thread->waiting = false;
4894 SDL_UnlockMutex(thread->drawmutex);
// Interprets the pending commands directly on the calling thread.
// NOTE(review): presumably this is the path for builds without worker
// threads - confirm which configuration reaches it.
4897 if (thread->commandoffset != dpsoftrast.drawcommand)
4898 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// Reset the pool's usage counter after the flush steps above.
4901 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: delegates to DPSOFTRAST_Draw_FlushThreads.
4904 void DPSOFTRAST_Flush(void)
4906 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably waits for all queued work to complete, in the same
// way as DPSOFTRAST_Flush - confirm against the function body.
4909 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread state.
//   width, height - stored as the framebuffer dimensions
//   numthreads    - requested thread count (see the clamping below)
//   interlace     - stored after being limited to 0..1
//   colorpixels   - stored as fb_colorpixels[0]
//   depthpixels   - stored as fb_depthpixels
// Any previous contents of the global state are overwritten with zeros first.
4914 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4924 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): presumably u is a union whose integer member was set to 1, so
// its last byte is non-zero only on big-endian hosts - confirm.
4925 dpsoftrast.bigendian = u.b[3];
4926 dpsoftrast.fb_width = width;
4927 dpsoftrast.fb_height = height;
4928 dpsoftrast.fb_depthpixels = depthpixels;
4929 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index [1] is cleared three times; [2] and [3] may have been
// intended. The memset above already zeroes the whole state, so this is
// harmless as written - confirm the array size before changing it.
4930 dpsoftrast.fb_colorpixels[1] = NULL;
4931 dpsoftrast.fb_colorpixels[1] = NULL;
4932 dpsoftrast.fb_colorpixels[1] = NULL;
// The initial viewport covers the whole framebuffer.
4933 dpsoftrast.viewport[0] = 0;
4934 dpsoftrast.viewport[1] = 0;
4935 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4936 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4937 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Texture bookkeeping starts at 1 with a capacity of 0.
// NOTE(review): presumably texture index 0 is reserved - confirm.
4938 dpsoftrast.texture_firstfree = 1;
4939 dpsoftrast.texture_end = 1;
4940 dpsoftrast.texture_max = 0;
// Default color: all four components set to 1.
4941 dpsoftrast.color[0] = 1;
4942 dpsoftrast.color[1] = 1;
4943 dpsoftrast.color[2] = 1;
4944 dpsoftrast.color[3] = 1;
// Assuming bound(lo, x, hi) clamps x - TODO confirm.
4945 dpsoftrast.interlace = bound(0, interlace, 1);
// Thread count is either the request limited to 1..64, or exactly 1.
// NOTE(review): the two assignments are presumably alternatives selected by a
// preprocessor conditional - confirm.
4947 dpsoftrast.numthreads = bound(1, numthreads, 64);
4949 dpsoftrast.numthreads = 1;
// Zero-initialized per-thread state, one entry per thread.
4951 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4952 for (i = 0; i < dpsoftrast.numthreads; i++)
4954 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// Default render state for this thread.
4956 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned next to [1]..[3] here, and the
// thread array is zero-filled - confirm it is set elsewhere.
4957 thread->colormask[1] = 1;
4958 thread->colormask[2] = 1;
4959 thread->colormask[3] = 1;
4960 thread->blendfunc[0] = GL_ONE;
4961 thread->blendfunc[1] = GL_ZERO;
4962 thread->depthmask = true;
4963 thread->depthtest = true;
4964 thread->depthfunc = GL_LEQUAL;
4965 thread->scissortest = false;
4966 thread->alphatest = false;
4967 thread->alphafunc = GL_GREATER;
4968 thread->alphavalue = 0.5f;
4969 thread->viewport[0] = 0;
4970 thread->viewport[1] = 0;
4971 thread->viewport[2] = dpsoftrast.fb_width;
4972 thread->viewport[3] = dpsoftrast.fb_height;
4973 thread->scissor[0] = 0;
4974 thread->scissor[1] = 0;
4975 thread->scissor[2] = dpsoftrast.fb_width;
4976 thread->scissor[3] = dpsoftrast.fb_height;
4977 thread->depthrange[0] = 0;
4978 thread->depthrange[1] = 1;
4979 thread->polygonoffset[0] = 0;
4980 thread->polygonoffset[1] = 0;
// Row bands owned by thread i. With interlace, the height is cut into
// 2*numthreads slices and the thread gets slice i and slice numthreads+i.
4982 if (dpsoftrast.interlace)
4984 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4985 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4986 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
4987 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// Otherwise the height is cut into numthreads slices and both band pairs are
// set to the same slice i.
4991 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4992 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4995 thread->numspans = 0;
4996 thread->numtriangles = 0;
4997 thread->commandoffset = 0;
4998 thread->waiting = false;
4999 thread->starving = false;
// Synchronization objects used by DPSOFTRAST_Draw_Thread and
// DPSOFTRAST_Draw_FlushThreads.
5001 thread->waitcond = SDL_CreateCond();
5002 thread->drawcond = SDL_CreateCond();
5003 thread->drawmutex = SDL_CreateMutex();
// NOTE(review): presumably -1 flags all state as needing validation - confirm.
5006 thread->validate = -1;
5007 DPSOFTRAST_Validate(thread, -1);
// Start the thread; it receives its own state struct as the argument.
5009 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// Stops the threads, releases texture and thread memory, and zeroes the
// global state. Takes no parameters and returns nothing.
5014 void DPSOFTRAST_Shutdown(void)
5018 if (dpsoftrast.numthreads > 0)
5020 DPSOFTRAST_State_Thread *thread;
5021 for (i = 0; i < dpsoftrast.numthreads; i++)
5023 thread = &dpsoftrast.threads[i];
// Signal drawcond under the mutex, then join the thread.
// NOTE(review): DPSOFTRAST_Draw_Thread only exits once thread->index is
// negative - confirm that it is set before this signal.
5024 SDL_LockMutex(thread->drawmutex);
5026 SDL_CondSignal(thread->drawcond);
5027 SDL_UnlockMutex(thread->drawmutex);
5028 SDL_WaitThread(thread->thread, NULL);
// The thread has been joined, so its synchronization objects can be destroyed.
5029 SDL_DestroyCond(thread->waitcond);
5030 SDL_DestroyCond(thread->drawcond);
5031 SDL_DestroyMutex(thread->drawmutex);
// Release the pixel storage of every texture slot that has any.
// NOTE(review): dpsoftrast.texture is dereferenced here before the NULL check
// below, and DPSOFTRAST_Init leaves texture_end at 1 - confirm the array is
// always allocated by the time this runs.
5035 for (i = 0;i < dpsoftrast.texture_end;i++)
5036 if (dpsoftrast.texture[i].bytes)
5037 MM_FREE(dpsoftrast.texture[i].bytes);
// The texture table is released with free(); the thread array with MM_FREE.
5038 if (dpsoftrast.texture)
5039 free(dpsoftrast.texture);
5040 if (dpsoftrast.threads)
5041 MM_FREE(dpsoftrast.threads);
// Leave the global state zeroed.
5042 memset(&dpsoftrast, 0, sizeof(dpsoftrast));