3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// Alignment in bytes that is passed to _mm_malloc by the MM_MALLOC/MM_CALLOC helpers.
14 #define ATOMIC_SIZE 32
// GCC-attribute variant: ALIGN() requests 16-byte and ATOMIC() 32-byte alignment.
18 #define ALIGN(var) var __attribute__((__aligned__(16)))
19 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// NOTE(review): _mm_sfence is documented as a store fence; the __sync_synchronize()
// alternative is left commented out on the following line -- confirm a store fence is enough.
20 #define MEMORY_BARRIER (_mm_sfence())
21 //(__sync_synchronize())
22 #define ATOMIC_COUNTER volatile int
// INCREMENT/DECREMENT evaluate to the result of __sync_add_and_fetch; ATOMIC_ADD discards its result.
23 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
24 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
25 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
26 #elif defined(_MSC_VER)
// MSVC variant of the same macros, built on __declspec(align) and the Interlocked* functions.
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
31 #define ATOMIC_COUNTER volatile LONG
32 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
33 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
// NOTE(review): unlike the (void)-cast __sync form above, this expands to the
// InterlockedExchangeAdd call itself, so the macro has a value here.
34 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// Plain definitions: ALIGN()/ATOMIC() expand to the bare declaration, and the macros
// guarded by #ifndef below become ordinary non-atomic integer operations.
39 #define ALIGN(var) var
42 #define ATOMIC(var) var
44 #ifndef MEMORY_BARRIER
45 #define MEMORY_BARRIER ((void)0)
47 #ifndef ATOMIC_COUNTER
48 #define ATOMIC_COUNTER int
50 #ifndef ATOMIC_INCREMENT
51 #define ATOMIC_INCREMENT(counter) (++(counter))
53 #ifndef ATOMIC_DECREMENT
54 #define ATOMIC_DECREMENT(counter) (--(counter))
57 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
61 #include <emmintrin.h>
// Aligned allocation helper: memory comes from _mm_malloc with ATOMIC_SIZE alignment.
63 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Zero-filled counterpart of MM_MALLOC for nmemb elements of size bytes each.
// NOTE(review): the product nmemb*size is not checked for overflow.
65 static void *MM_CALLOC(size_t nmemb, size_t size)
67 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
// clear the buffer only when the allocation succeeded
68 if (ptr != NULL) memset(ptr, 0, nmemb*size);
72 #define MM_FREE _mm_free
// Alternative definitions that map straight onto malloc/calloc.
74 #define MM_MALLOC(size) malloc(size)
75 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the number of entries and is used as an array dimension
// (e.g. the triangle attribs array and post_array4f).
79 typedef enum DPSOFTRAST_ARRAY_e
81 DPSOFTRAST_ARRAY_POSITION,
82 DPSOFTRAST_ARRAY_COLOR,
83 DPSOFTRAST_ARRAY_TEXCOORD0,
84 DPSOFTRAST_ARRAY_TEXCOORD1,
85 DPSOFTRAST_ARRAY_TEXCOORD2,
86 DPSOFTRAST_ARRAY_TEXCOORD3,
87 DPSOFTRAST_ARRAY_TEXCOORD4,
88 DPSOFTRAST_ARRAY_TEXCOORD5,
89 DPSOFTRAST_ARRAY_TEXCOORD6,
90 DPSOFTRAST_ARRAY_TEXCOORD7,
91 DPSOFTRAST_ARRAY_TOTAL
// Texture record; instances live in the dpsoftrast.texture array.
95 typedef struct DPSOFTRAST_Texture_s
102 DPSOFTRAST_TEXTURE_FILTER filter;
105 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL value is how an unused slot is recognized
// (see DPSOFTRAST_Texture_GetByIndex and the slot search in DPSOFTRAST_Texture_New)
106 unsigned char *bytes;
// per-mip-level info as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
107 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity DPSOFTRAST_ALIGNCOMMAND rounds command sizes up to;
// COMMAND_ALIGN applies the ALIGN() alignment to command structs.
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
// Generic command header; the structs generated by DEFCOMMAND start with the same two members.
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
116 unsigned char opcode;
117 unsigned short commandsize;
// opcode 0 is the Reset marker that DPSOFTRAST_AllocateCommand writes when a command
// does not fit in the space left at the end of the pool
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the
// struct type DPSOFTRAST_Command_<name>, which begins with opcode and commandsize.
123 #define DEFCOMMAND(opcodeval, name, fields) \
124 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
127 unsigned char opcode; \
128 unsigned short commandsize; \
130 } DPSOFTRAST_Command_##name );
// Size in bytes of the commands buffer below; also the modulus used when command
// offsets wrap around in the allocation code.
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command pool: the byte buffer that DPSOFTRAST_AllocateCommand hands out commands from.
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
139 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
141 DPSOFTRAST_State_Command_Pool);
// Per-triangle data read by the DPSOFTRAST_CALCATTRIB macros.
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
145 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] is multiplied by span->x, [1] by span->y, and [2] is the base value
// they are added to (see DPSOFTRAST_CALCATTRIB)
147 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
149 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB (SSE) and DPSOFTRAST_CALCATTRIB4F (scalar float[4]) evaluate
// attribute arrayindex of triangle at the position stored in span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// The SSE form uses _mm_load_ps on the attribs rows.
// NOTE(review): the 4F form writes (span->x) without parenthesizing span itself, so span
// must be a simple expression there.
151 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
152 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
153 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
154 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
155 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
157 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
158 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
159 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
160 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
161 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
162 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
163 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
164 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
165 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// Span record: the generating triangle, a framebuffer position, the usable x range
// and the per-pixel depth-test mask.
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
172 int triangle; // triangle this span was generated by
173 int x; // framebuffer x coord
174 int y; // framebuffer y coord
175 int startx; // usable range (according to pixelmask)
176 int endx; // usable range (according to pixelmask)
177 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
179 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in DPSOFTRAST_State_Thread.validate; each bit selects one Recalc* step
// in DPSOFTRAST_Validate. DPSOFTRAST_VALIDATE_DRAW is the union of all three.
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the blendfunc factor pair
// and the blendsubtract flag; DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
189 typedef enum DPSOFTRAST_BLENDMODE_e
191 DPSOFTRAST_BLENDMODE_OPAQUE,
192 DPSOFTRAST_BLENDMODE_ALPHA,
193 DPSOFTRAST_BLENDMODE_ADDALPHA,
194 DPSOFTRAST_BLENDMODE_ADD,
195 DPSOFTRAST_BLENDMODE_INVMOD,
196 DPSOFTRAST_BLENDMODE_MUL,
197 DPSOFTRAST_BLENDMODE_MUL2,
198 DPSOFTRAST_BLENDMODE_SUBALPHA,
199 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
200 DPSOFTRAST_BLENDMODE_INVADD,
201 DPSOFTRAST_BLENDMODE_TOTAL
203 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state: render state written by the DPSOFTRAST_Interpret_* functions,
// values derived from it by DPSOFTRAST_Validate, and the thread's command pool position.
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
224 float polygonoffset[2];
227 int shader_permutation;
229 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
231 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
234 // DPSOFTRAST_VALIDATE_ flags
237 // derived values (DPSOFTRAST_VALIDATE_FB)
240 ALIGN(float fb_viewportcenter[4]);
241 ALIGN(float fb_viewportscale[4]);
243 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
246 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it
// with the allocation offset to measure how far the thread is behind
255 ATOMIC(volatile int commandoffset);
// waiting is set around the Thread_CondWait in DPSOFTRAST_Draw_FreeCommandPool;
// starving is tested there before drawcond is signalled
257 volatile bool waiting;
258 volatile bool starving;
265 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is the variable dpsoftrast declared below.
270 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets as recorded by DPSOFTRAST_SetRenderTargets
274 unsigned int *fb_depthpixels;
275 unsigned int *fb_colorpixels[4];
// viewport transform computed by DPSOFTRAST_RecalcViewport
278 ALIGN(float fb_viewportcenter[4]);
279 ALIGN(float fb_viewportscale[4]);
282 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
285 const float *pointer_vertex3f;
286 const float *pointer_color4f;
287 const unsigned char *pointer_color4ub;
288 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
291 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
297 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298 float *screencoord4f;
304 int shader_permutation;
// index at which DPSOFTRAST_Texture_New starts looking for an unused texture slot
308 int texture_firstfree;
// texture array; reallocated by DPSOFTRAST_Texture_Grow
309 DPSOFTRAST_Texture *texture;
// set to a static message by functions in this file when they reject their arguments
314 const char *errorstring;
319 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
321 ATOMIC(volatile int drawcommand);
323 DPSOFTRAST_State_Command_Pool commandpool;
327 DPSOFTRAST_State dpsoftrast;
329 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
330 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer laid out as a<<24 | r<<16 | g<<8 | b;
// each channel is scaled by 255, offset by 0.5 and truncated to int.
// NOTE(review): r, g, b, a are expanded without parentheses, so pass simple expressions.
331 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts a float depth d to an integer as DPSOFTRAST_DEPTHSCALE * (1-d).
// NOTE(review): d is expanded without parentheses.
332 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
333 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Fills the 4-element fb_viewportcenter/fb_viewportscale arrays from viewport, which is
// read as (x, y, width, height).
// Element 1 holds the x terms and element 2 the y terms; the y center is measured from
// dpsoftrast.fb_height and the y scale is negative. Elements 0 and 3 are constants.
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
337 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339 fb_viewportcenter[3] = 0.5f;
340 fb_viewportcenter[0] = 0.0f;
341 fb_viewportscale[1] = 0.5f * viewport[2];
342 fb_viewportscale[2] = -0.5f * viewport[3];
343 fb_viewportscale[3] = 0.5f;
344 fb_viewportscale[0] = 1.0f;
// Recomputes thread->fb_scissor (stored as x, y, width, height) and the thread's
// viewport center/scale from its scissor, scissortest and viewport state.
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
349 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350 // and viewport projection values
// scissor rectangle as corner coordinates, with y measured from dpsoftrast.fb_height
353 x1 = thread->scissor[0];
354 x2 = thread->scissor[0] + thread->scissor[2];
355 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
357 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
359 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
361 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362 thread->fb_scissor[0] = x1;
363 thread->fb_scissor[1] = y1;
364 thread->fb_scissor[2] = x2 - x1;
365 thread->fb_scissor[3] = y2 - y1;
367 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derives thread->fb_depthfunc: the configured depthfunc while depth testing is enabled,
// GL_ALWAYS otherwise.
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
372 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the factor pair in thread->blendfunc, together with thread->blendsubtract, to a
// DPSOFTRAST_BLENDMODE value stored in thread->fb_blendmode. Pairs without a case fall
// back to DPSOFTRAST_BLENDMODE_OPAQUE.
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
377 if (thread->blendsubtract)
// both factors are combined into one switch key: blendfunc[0] shifted left by 16, OR blendfunc[1]
379 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the given blend mode
381 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
// the only pair with a case in the subtract switch
383 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
389 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs both select MUL
396 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one bit of f is pending in thread->validate;
// the expression evaluates to 0 either way.
// NOTE(review): thread is expanded without parentheses in thread->validate.
406 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Runs the Recalc* step for every bit that is set in both mask and thread->validate,
// clearing each bit in thread->validate before its step runs.
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested bits that are actually pending
410 mask &= thread->validate;
413 if (mask & DPSOFTRAST_VALIDATE_FB)
415 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416 DPSOFTRAST_RecalcFB(thread);
418 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
420 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421 DPSOFTRAST_RecalcDepthFunc(thread);
423 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
425 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426 DPSOFTRAST_RecalcBlendFunc(thread);
// Returns the texture record for index when index lies in [1, dpsoftrast.texture_end) and
// the slot has pixel storage. Callers in this file treat a null result as "no such texture".
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
432 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every non-null texbound pointer
// (global and per-thread) so that it points into the reallocated array.
437 static void DPSOFTRAST_Texture_Grow(void)
// old base address, needed to turn bound pointers into indices after the realloc
439 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440 DPSOFTRAST_State_Thread *thread;
444 // expand texture array as needed
445 if (dpsoftrast.texture_max < 1024)
446 dpsoftrast.texture_max = 1024;
448 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL check, so a
// failed realloc loses the old array; oldtexture is also used in pointer arithmetic below
// after realloc may have released that block.
449 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures
450 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451 if (dpsoftrast.texbound[i])
452 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the textures bound in each thread's state
453 for (j = 0; j < dpsoftrast.numthreads; j++)
455 thread = &dpsoftrast.threads[j];
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (thread->texbound[i])
458 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validates flags and dimensions, claims a slot in dpsoftrast.texture, records the mip
// level layout and allocates zero-filled pixel storage for the texture.
// Each rejected argument combination sets dpsoftrast.errorstring to a static message.
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides, everything else one
471 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the MAXSIZE range check below, so very large
// inputs can overflow int here; two negative dimensions also yield a positive product.
474 if (width*height*depth < 1)
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
481 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
486 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
490 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
493 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
503 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
508 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
510 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
515 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the check directly above.
518 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero for any positive x that is not a power of two
528 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533 // find first empty slot in texture array
534 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts just past the slot being claimed
537 dpsoftrast.texture_firstfree = texnum + 1;
// enlarge the array when the chosen index is beyond its capacity
538 if (dpsoftrast.texture_max <= texnum)
539 DPSOFTRAST_Texture_Grow();
540 if (dpsoftrast.texture_end <= texnum)
541 dpsoftrast.texture_end = texnum + 1;
542 texture = &dpsoftrast.texture[texnum];
543 memset(texture, 0, sizeof(*texture));
544 texture->flags = flags;
545 texture->width = width;
546 texture->height = height;
547 texture->depth = depth;
548 texture->sides = sides;
// byte size of this mip level: 4 bytes per texel
560 s = w * h * d * sides * 4;
// record the level: [0] byte offset, [1] byte size, [2..4] dimensions
561 texture->mipmap[mipmaps][0] = size;
562 texture->mipmap[mipmaps][1] = s;
563 texture->mipmap[mipmaps][2] = w;
564 texture->mipmap[mipmaps][3] = h;
565 texture->mipmap[mipmaps][4] = d;
// condition is true at a 1x1x1 level or when mipmaps were not requested
568 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 texture->mipmaps = mipmaps;
575 texture->size = size;
577 // allocate the pixels now
// NOTE(review): no NULL check on the allocation result is visible here; a failed
// allocation would leave bytes NULL, which the rest of the file reads as an unused slot.
578 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture index, clears its record and updates
// texture_firstfree / texture_end. Does nothing when the texture does not exist.
582 void DPSOFTRAST_Texture_Free(int index)
584 DPSOFTRAST_Texture *texture;
585 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
589 MM_FREE(texture->bytes);
// NOTE(review): the memset on the next line zeroes the whole record, including bytes.
590 texture->bytes = NULL;
591 memset(texture, 0, sizeof(*texture));
592 // adjust the free range and used range
593 if (dpsoftrast.texture_firstfree > index)
594 dpsoftrast.texture_firstfree = index;
// drop trailing unused slots from the used range
595 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596 dpsoftrast.texture_end--;
// Regenerates mip levels 1 .. mipmaps-1 of texture index by averaging 4-byte texels of the
// previous level. Does nothing when the texture does not exist.
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
600 int i, x, y, z, w, layer0, layer1, row0, row1;
601 unsigned char *o, *i0, *i1, *i2, *i3;
602 DPSOFTRAST_Texture *texture;
603 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604 if (texture->mipmaps <= 1)
606 for (i = 1;i < texture->mipmaps;i++)
608 for (z = 0;z < texture->mipmap[i][4];z++)
// keep the second source layer inside the parent level
612 if (layer1 >= texture->mipmap[i-1][4])
613 layer1 = texture->mipmap[i-1][4]-1;
614 for (y = 0;y < texture->mipmap[i][3];y++)
// keep the second source row inside the parent level
618 if (row1 >= texture->mipmap[i-1][3])
619 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: parent rows (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
620 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
621 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625 w = texture->mipmap[i][2];
628 if (texture->mipmap[i-1][2] > 1)
630 // average 3D texture
// eight source texels per output texel; +4 before >>3 rounds to nearest
631 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
633 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
641 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read in the body but not advanced by this loop, and i0/i1
// step by 8 bytes although the parent row is one texel wide; this only matters if w can
// exceed 1 on this path -- confirm.
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
644 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
653 if (texture->mipmap[i-1][2] > 1)
655 // average 2D texture (common case)
// four source texels per output texel; +2 before >>2 rounds to nearest
656 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
658 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
666 // 2D texture with parent width == 1
// two source texels (one per row) per output texel
667 o[0] = (i0[0] + i1[0] + 1) >> 1;
668 o[1] = (i0[1] + i1[1] + 1) >> 1;
669 o[2] = (i0[2] + i1[2] + 1) >> 1;
670 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies rows of blockwidth 4-byte pixels from pixels into the texture storage, starting at
// (blockx, blocky) of mip level 0, then regenerates the mipmaps.
// NOTE(review): the destination is always computed from mipmap[0]; the mip parameter is not
// used by the statements shown here, and the block rectangle is not range-checked.
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
679 DPSOFTRAST_Texture *texture;
681 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel: row blocky, column blockx of level 0
684 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685 while (blockheight > 0)
687 memcpy(dst, pixels, blockwidth * 4);
// source rows are tightly packed; destination rows are a full level-0 width apart
688 pixels += blockwidth * 4;
689 dst += texture->mipmap[0][2] * 4;
692 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces mip level 0 of texture index with the data at pixels (mipmap[0][1] bytes are
// copied to the start of the texture storage) and regenerates the mipmaps.
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
696 DPSOFTRAST_Texture *texture;
697 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width recorded for mip level mip of texture index, or 0 when the texture
// does not exist.
// NOTE(review): mip is used as an array index without a range check.
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707 return texture->mipmap[mip][2];
// Returns the height recorded for mip level mip of texture index, or 0 when the texture
// does not exist.
// NOTE(review): mip is used as an array index without a range check.
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][3];
// Returns the depth recorded for mip level mip of texture index, or 0 when the texture
// does not exist.
// NOTE(review): mip is used as an array index without a range check.
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture storage, or a
// null pointer when the texture does not exist.
// NOTE(review): mip is used as an array index without a range check.
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
727 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture index. A filter value above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created without
// DPSOFTRAST_TEXTURE_FLAG_MIPMAP is reported through dpsoftrast.errorstring.
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
735 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
740 texture->filter = filter;
// Records the framebuffer size, the depth buffer and up to four color buffers in dpsoftrast.
// The condition tests whether any of them differs from the currently recorded value.
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
745 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
749 dpsoftrast.fb_width = width;
750 dpsoftrast.fb_height = height;
751 dpsoftrast.fb_depthpixels = depthpixels;
752 dpsoftrast.fb_colorpixels[0] = colorpixels0;
753 dpsoftrast.fb_colorpixels[1] = colorpixels1;
754 dpsoftrast.fb_colorpixels[2] = colorpixels2;
755 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; DPSOFTRAST_AllocateCommand calls this before its definition
758 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current allocation offset: issues MEMORY_BARRIER when threads are in use,
// then copies commandpool.freecommand into dpsoftrast.drawcommand.
760 static void DPSOFTRAST_Draw_SyncCommands(void)
762 if(dpsoftrast.usethreads) MEMORY_BARRIER;
763 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Works towards having at least space bytes available in the command pool: recomputes how
// much of the pool the threads still have to process and, if that is too much, waits on
// one thread's condition variable.
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room according to the cached usedcommands value?
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
774 DPSOFTRAST_Draw_SyncCommands();
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
// distance from this thread's position to the allocation offset, wrapped into the pool size
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// the thread that is furthest behind determines usedcommands
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
794 thread = &dpsoftrast.threads[waitindex];
795 Thread_LockMutex(thread->drawmutex);
// wait only if the thread has not yet reached the published command offset
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
799 if (thread->starving) Thread_CondSignal(thread->drawcond);
800 Thread_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 Thread_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE; the mask arithmetic requires
// COMMAND_SIZE to be a power of two.
808 #define DPSOFTRAST_ALIGNCOMMAND(size) \
809 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>
// and the aligned size of that struct.
810 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
811 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes in the command pool for a command with the given opcode and fills in
// the command's opcode and commandsize.
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
815 DPSOFTRAST_Command *command;
816 int freecommand = dpsoftrast.commandpool.freecommand;
817 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra space needed besides the command itself: one command header, plus the tail of the
// pool when the command does not fit before the end
818 int extra = sizeof(DPSOFTRAST_Command);
819 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: make room via DPSOFTRAST_Draw_FreeCommandPool or
// DPSOFTRAST_Draw_FlushThreads, then reload the pool counters
821 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
823 if (dpsoftrast.usethreads)
824 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
826 DPSOFTRAST_Draw_FlushThreads();
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: write a Reset marker at the current
// offset and count the remaining tail as used
830 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// place the new command and fill in its header
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes size bytes back from the command pool: usedcommands is reduced by size and an
// adjusted freecommand (with a DPSOFTRAST_DRAW_MAXCOMMANDPOOL wrap-around correction) is
// stored back into the pool.
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
854 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855 usedcommands -= size;
856 dpsoftrast.commandpool.freecommand = freecommand;
857 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1 (Viewport): copies the rectangle into thread->viewport and marks the
// framebuffer-derived values as needing recalculation.
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 thread->viewport[0] = command->x;
864 thread->viewport[1] = command->y;
865 thread->viewport[2] = command->width;
866 thread->viewport[3] = command->height;
867 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a Viewport command for the threads and also stores the rectangle in
// dpsoftrast.viewport, recomputing the global viewport center/scale right away.
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
881 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2 (ClearColor). Steps shown: validate the framebuffer state, take the scissor
// rectangle, clamp its y range to the thread's miny1..maxy2 limits, pack the color to one
// BGRA8 value, and walk the rows of each of the four color buffer pointers.
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 int i, x1, y1, x2, y2, w, h, x, y;
888 int miny1 = thread->miny1;
889 int maxy1 = thread->maxy1;
890 int miny2 = thread->miny2;
891 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
895 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896 x1 = thread->fb_scissor[0];
897 y1 = thread->fb_scissor[1];
898 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900 if (y1 < miny1) y1 = miny1;
901 if (y2 > maxy2) y2 = maxy2;
906 // FIXME: honor fb_colormask?
907 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908 for (i = 0;i < 4;i++)
910 if (!dpsoftrast.fb_colorpixels[i])
// rows are covered in two bands: first up to maxy1, then from miny2 up to maxy2
912 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in color buffer i
915 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916 for (x = x1;x < x2;x++)
// Allocates a ClearColor command in the command pool for the given color.
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
923 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3 (ClearDepth). Steps shown: validate the framebuffer state, take the scissor
// rectangle, clamp its y range to the thread's miny1..maxy2 limits, convert the depth with
// DPSOFTRAST_DEPTH32_FROM_DEPTH32F, and walk the rows of the depth buffer.
930 DEFCOMMAND(3, ClearDepth, float depth;)
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
933 int x1, y1, x2, y2, w, h, x, y;
934 int miny1 = thread->miny1;
935 int maxy1 = thread->maxy1;
936 int miny2 = thread->miny2;
937 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
941 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942 x1 = thread->fb_scissor[0];
943 y1 = thread->fb_scissor[1];
944 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946 if (y1 < miny1) y1 = miny1;
947 if (y2 > maxy2) y2 = maxy2;
952 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// rows are covered in two bands: first up to maxy1, then from miny2 up to maxy2
953 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in the depth buffer
956 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957 for (x = x1;x < x2;x++)
// Allocates a ClearDepth command in the command pool for depth value d.
961 void DPSOFTRAST_ClearDepth(float d)
963 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4 (ColorMask): stores each channel enable as 0 or 1 and builds fb_colormask with
// 0xFF in the byte of every enabled channel (A bits 24-31, R bits 16-23, G bits 8-15, B bits 0-7).
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
970 thread->colormask[0] = command->r != 0;
971 thread->colormask[1] = command->g != 0;
972 thread->colormask[2] = command->b != 0;
973 thread->colormask[3] = command->a != 0;
// negating the 0/1 flag gives 0 or all-ones, which then selects the channel's byte
974 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Allocates a ColorMask command in the command pool for the given channel enables.
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
978 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5 (DepthTest): stores the enable flag and marks fb_depthfunc for recalculation.
985 DEFCOMMAND(5, DepthTest, int enable;)
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
988 thread->depthtest = command->enable;
989 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Allocates a DepthTest command carrying the enable flag.
991 void DPSOFTRAST_DepthTest(int enable)
993 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994 command->enable = enable;
// Command 6 (ScissorTest): stores the enable flag and marks the framebuffer-derived values
// (which include fb_scissor) for recalculation.
997 DEFCOMMAND(6, ScissorTest, int enable;)
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1000 thread->scissortest = command->enable;
1001 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a ScissorTest command carrying the enable flag.
1003 void DPSOFTRAST_ScissorTest(int enable)
1005 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006 command->enable = enable;
// Command 7 (Scissor): copies the rectangle (carried as floats) into thread->scissor and
// marks the framebuffer-derived values for recalculation.
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1012 thread->scissor[0] = command->x;
1013 thread->scissor[1] = command->y;
1014 thread->scissor[2] = command->width;
1015 thread->scissor[3] = command->height;
1016 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Allocates a Scissor command for the given rectangle.
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1020 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1023 command->width = width;
1024 command->height = height;
// Command 8 (BlendFunc): stores the source/destination factors and marks fb_blendmode for
// recalculation.
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1030 thread->blendfunc[0] = command->sfactor;
1031 thread->blendfunc[1] = command->dfactor;
1032 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendFunc command carrying the two blend factors.
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1036 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037 command->sfactor = sfactor;
1038 command->dfactor = dfactor;
// Command 9 (BlendSubtract): stores the enable flag and marks fb_blendmode for recalculation.
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1044 thread->blendsubtract = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Allocates a BlendSubtract command carrying the enable flag.
1047 void DPSOFTRAST_BlendSubtract(int enable)
1049 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050 command->enable = enable;
// Command 10 (DepthMask): stores the enable flag in thread->depthmask.
1053 DEFCOMMAND(10, DepthMask, int enable;)
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1056 thread->depthmask = command->enable;
// Allocates a DepthMask command carrying the enable flag.
1058 void DPSOFTRAST_DepthMask(int enable)
1060 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061 command->enable = enable;
// Command 11 (DepthFunc): stores the comparison function in thread->depthfunc.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest this does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC, although DPSOFTRAST_RecalcDepthFunc derives fb_depthfunc
// from depthfunc -- confirm that fb_depthfunc cannot go stale after a DepthFunc change.
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1067 thread->depthfunc = command->func;
// Allocates a DepthFunc command carrying the comparison function.
1069 void DPSOFTRAST_DepthFunc(int func)
1071 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072 command->func = func;
// Command 12 (DepthRange): stores the near and far values in thread->depthrange[0] and [1].
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1078 thread->depthrange[0] = command->nearval;
1079 thread->depthrange[1] = command->farval;
// Allocates a DepthRange command carrying the near and far values.
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1083 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084 command->nearval = nearval;
1085 command->farval = farval;
// Command 13: PolygonOffset, carrying the two offset terms.
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a queued PolygonOffset command: alongnormal goes to
// thread->polygonoffset[0], intoview to thread->polygonoffset[1].
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1091 thread->polygonoffset[0] = command->alongnormal;
1092 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both
// offset terms in it.
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1096 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097 command->alongnormal = alongnormal;
1098 command->intoview = intoview;
// Command 14: CullFace, carrying the culling mode as an int.
1101 DEFCOMMAND(14, CullFace, int mode;)
// Applies a queued CullFace command by copying the mode into thread->cullface.
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1104 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the requested
// mode in it.
1106 void DPSOFTRAST_CullFace(int mode)
1108 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109 command->mode = mode;
// Command 15: AlphaTest, carrying a single enable flag.
1112 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies a queued AlphaTest command by copying the flag into
// thread->alphatest.
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1115 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records the enable
// flag in it.
1117 void DPSOFTRAST_AlphaTest(int enable)
1119 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120 command->enable = enable;
// Command 16: AlphaFunc, carrying the comparison function and reference value.
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies a queued AlphaFunc command: func goes to thread->alphafunc and ref
// to thread->alphavalue.
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1126 thread->alphafunc = command->func;
1127 thread->alphavalue = command->ref;
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1131 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132 command->func = func;
// Sets the constant colour stored in dpsoftrast.color (r, g, b, a order).
// Unlike the state setters above this writes the global directly and does not
// allocate a command.
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1138 dpsoftrast.color[0] = r;
1139 dpsoftrast.color[1] = g;
1140 dpsoftrast.color[2] = b;
1141 dpsoftrast.color[3] = a;
// Reads a rectangle of 4-byte pixels out of colour buffer 0
// (dpsoftrast.fb_colorpixels[0]) into outpixels.
//
// blockx, blocky           - origin of the requested rectangle
// blockwidth, blockheight  - size of the requested rectangle
// outpixels                - destination; rows are blockwidth*4 bytes apart
//
// The rectangle is clamped to the framebuffer bounds before copying.  Source
// rows are addressed as (fb_height - 1 - y), so the image is flipped
// vertically relative to framebuffer storage.  A separate loop handles
// dpsoftrast.bigendian.
// NOTE(review): output rows/columns are indexed from the clamped origin
// (y - by1, column 0), so a request that starts outside the framebuffer is
// written shifted toward the start of outpixels - confirm callers never pass a
// negative origin.
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1146 int outstride = blockwidth * 4;
1147 int instride = dpsoftrast.fb_width * 4;
1150 int bx2 = blockx + blockwidth;
1151 int by2 = blocky + blockheight;
1155 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer
1159 if (bx1 < 0) bx1 = 0;
1160 if (by1 < 0) by1 = 0;
1161 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1162 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1164 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1165 if (dpsoftrast.bigendian)
1167 for (y = by1;y < by2;y++)
1169 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1170 o = (unsigned char *)outpixels + (y - by1) * outstride;
1171 for (x = bx1;x < bx2;x++)
1184 for (y = by1;y < by2;y++)
1186 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1187 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of 32-bit pixels from colour buffer 0 into one mip level
// of a texture.
//
// index          - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
// mip            - mip level; must be in [0, texture->mipmaps)
// tx, ty         - destination origin inside the texture level
// sx, sy         - source origin inside the framebuffer
// width, height  - size of the rectangle
//
// Returns without copying when the texture handle or mip level is invalid.
// Both rectangles are clamped to their surface and the copy size is limited
// to the smaller of the two.  After the copy the mip chain is regenerated
// with DPSOFTRAST_Texture_CalculateMipmaps when the texture has more than one
// level.
1193 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1197 int tx2 = tx + width;
1198 int ty2 = ty + height;
1201 int sx2 = sx + width;
1202 int sy2 = sy + height;
1212 unsigned int *spixels;
1213 unsigned int *tpixels;
1214 DPSOFTRAST_Texture *texture;
1215 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1216 if (mip < 0 || mip >= texture->mipmaps) return;
1218 spixels = dpsoftrast.fb_colorpixels[0];
1219 swidth = dpsoftrast.fb_width;
1220 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3] as
// the level's width and height
1221 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1222 twidth = texture->mipmap[mip][2];
1223 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the texture level
1224 if (tx1 < 0) tx1 = 0;
1225 if (ty1 < 0) ty1 = 0;
1226 if (tx2 > twidth) tx2 = twidth;
1227 if (ty2 > theight) ty2 = theight;
// clamp the source rectangle to the framebuffer
1228 if (sx1 < 0) sx1 = 0;
1229 if (sy1 < 0) sy1 = 0;
1230 if (sx2 > swidth) sx2 = swidth;
1231 if (sy2 > sheight) sy2 = sheight;
1236 if (tw > sw) tw = sw;
1237 if (th > sh) th = sh;
1238 if (tw < 1 || th < 1)
// one memcpy per row, tw pixels of 4 bytes each
1240 for (y = 0;y < th;y++)
1241 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1242 if (texture->mipmaps > 1)
1243 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17: SetTexture, carrying the texture unit and the texture pointer.
1246 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a queued SetTexture command to one thread: if a texture is already
// bound on that unit its binds counter is atomically decremented, then the
// unit is pointed at the command's texture.
1247 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1249 if (thread->texbound[command->unitnum])
1250 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1251 thread->texbound[command->unitnum] = command->texture;
1253 void DPSOFTRAST_SetTexture(int unitnum, int index)
1255 DPSOFTRAST_Command_SetTexture *command;
1256 DPSOFTRAST_Texture *texture;
1257 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1259 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1262 texture = DPSOFTRAST_Texture_GetByIndex(index);
1263 if (index && !texture)
1265 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1269 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1270 command->unitnum = unitnum;
1271 command->texture = texture;
1273 dpsoftrast.texbound[unitnum] = texture;
1274 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the client vertex position array and its stride in the global
// state.  The stride is later multiplied by a vertex index and added to a byte
// pointer (see DPSOFTRAST_Array_Load), i.e. it is in bytes.
1277 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1279 dpsoftrast.pointer_vertex3f = vertex3f;
1280 dpsoftrast.stride_vertex = stride;
// Selects a float colour array: stores the pointer and stride and clears the
// unsigned-byte colour pointer so only one colour source is active.
1282 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1284 dpsoftrast.pointer_color4f = color4f;
1285 dpsoftrast.pointer_color4ub = NULL;
1286 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte colour array: stores the pointer and stride and
// clears the float colour pointer so only one colour source is active.
1288 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1290 dpsoftrast.pointer_color4f = NULL;
1291 dpsoftrast.pointer_color4ub = color4ub;
1292 dpsoftrast.stride_color = stride;
// Records a texture coordinate array for slot `unitnum`: the pointer, the
// number of float components per element, and the stride.
// NOTE(review): unitnum is used as an array index without a range check -
// confirm callers validate it.
1294 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1296 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1297 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1298 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18: SetShader, carrying the shader mode and permutation.
1301 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Applies a queued SetShader command by copying mode and permutation into the
// thread's shader state.
1302 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1304 thread->shader_mode = command->mode;
1305 thread->shader_permutation = command->permutation;
// Public entry point: queues a SetShader command and also mirrors the mode and
// permutation into the global dpsoftrast state immediately.
1307 void DPSOFTRAST_SetShader(int mode, int permutation)
1309 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1310 command->mode = mode;
1311 command->permutation = permutation;
1313 dpsoftrast.shader_mode = mode;
1314 dpsoftrast.shader_permutation = permutation;
// Command 19: Uniform4f, carrying a uniform index and four floats.
1317 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a queued Uniform4f command: copies the four floats into the
// thread's uniform4f array at float offset index*4.
1318 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1320 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: sets a vec4 uniform from four scalars.  Queues a
// Uniform4f command and also writes the values into the global
// dpsoftrast.uniform4f array at float offset index*4.
1322 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1324 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1325 command->index = index;
1326 command->val[0] = v0;
1327 command->val[1] = v1;
1328 command->val[2] = v2;
1329 command->val[3] = v3;
1331 dpsoftrast.uniform4f[index*4+0] = v0;
1332 dpsoftrast.uniform4f[index*4+1] = v1;
1333 dpsoftrast.uniform4f[index*4+2] = v2;
1334 dpsoftrast.uniform4f[index*4+3] = v3;
// Public entry point: sets a vec4 uniform from an array of four floats.
// Queues a Uniform4f command (the same command type as DPSOFTRAST_Uniform4f)
// and mirrors the values into dpsoftrast.uniform4f at float offset index*4.
// v must point to at least four readable floats.
1336 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1338 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1339 command->index = index;
1340 memcpy(command->val, v, sizeof(command->val));
1342 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20: UniformMatrix4f, carrying a uniform index and 16 floats declared
// through the ALIGN() macro.
1345 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a queued UniformMatrix4f command: copies all 16 floats into the
// thread's uniform4f array starting at float offset index*4, so a matrix
// occupies four consecutive vec4 slots.
1346 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1348 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: sets `arraysize` consecutive 4x4 matrix uniforms.
//
// uniform    - first uniform slot; each matrix advances the slot index by 4
// arraysize  - number of matrices to read from v
// transpose  - presumably selects the 4x4 transpose sequence below; the
//              condition guarding it is not visible here - TODO confirm
// v          - 16 floats per matrix; may be unaligned
//
// One UniformMatrix4f command is queued per matrix and the same four rows are
// also stored into the global dpsoftrast.uniform4f array.
1350 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1354 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1356 __m128 m0, m1, m2, m3;
1357 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1358 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the source address
1359 if (((size_t)v)&(ALIGN_SIZE-1))
1361 m0 = _mm_loadu_ps(v);
1362 m1 = _mm_loadu_ps(v+4);
1363 m2 = _mm_loadu_ps(v+8);
1364 m3 = _mm_loadu_ps(v+12);
1368 m0 = _mm_load_ps(v);
1369 m1 = _mm_load_ps(v+4);
1370 m2 = _mm_load_ps(v+8);
1371 m3 = _mm_load_ps(v+12);
// 4x4 transpose built from unpack and move-half operations
1375 __m128 t0, t1, t2, t3;
1376 t0 = _mm_unpacklo_ps(m0, m1);
1377 t1 = _mm_unpacklo_ps(m2, m3);
1378 t2 = _mm_unpackhi_ps(m0, m1);
1379 t3 = _mm_unpackhi_ps(m2, m3);
1380 m0 = _mm_movelh_ps(t0, t1);
1381 m1 = _mm_movehl_ps(t1, t0);
1382 m2 = _mm_movelh_ps(t2, t3);
1383 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations must be 16-byte aligned
1385 _mm_store_ps(command->val, m0);
1386 _mm_store_ps(command->val+4, m1);
1387 _mm_store_ps(command->val+8, m2);
1388 _mm_store_ps(command->val+12, m3);
1389 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1390 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1391 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1392 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21: Uniform1i, carrying a uniform index and one int.
1397 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a queued Uniform1i command by storing the value into the thread's
// uniform1i array at the given index.
1398 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1400 thread->uniform1i[command->index] = command->val;
1402 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1404 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1405 command->index = index;
1408 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` four-float elements from a strided byte source into dst.
//
// dst     - destination, written with aligned stores (must be 16-byte aligned)
// src     - first source element
// size    - number of elements
// stride  - byte distance between source elements
//
// Unaligned loads are used when either the source address or the stride is
// not a multiple of ALIGN_SIZE; otherwise aligned loads are used.
1412 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1414 float *end = dst + size*4;
1415 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1419 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1428 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements from a strided byte source into
// four-float elements in dst, setting the fourth component to 1.0.
//
// dst     - destination, written with aligned stores (must be 16-byte aligned)
// src     - first source element
// size    - number of elements
// stride  - byte distance between source elements
//
// When the source is tightly packed (stride == sizeof(float[3])) four elements
// are converted per iteration from three 16-byte loads.  The per-element path
// rotates the vector so lane 0 can be replaced by 1.0 with _mm_move_ss, then
// rotates back, giving (x, y, z, 1).
// NOTE(review): the per-element path loads 16 bytes from a 12-byte element, so
// it reads 4 bytes past the last element - confirm the source arrays are
// padded.
1435 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1437 float *end = dst + size*4;
1438 if (stride == sizeof(float[3]))
// end4 bounds the part of the output handled four elements at a time
1440 float *end4 = dst + (size&~3)*4;
1441 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1445 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1446 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1447 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1449 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1450 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1453 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1454 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1455 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1456 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1457 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 src += 4*sizeof(float[3]);
// same conversion as above using aligned loads
1466 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1467 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1468 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1471 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1474 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1475 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1476 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1477 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 src += 4*sizeof(float[3]);
// per-element path: unaligned or aligned load depending on address and stride
1484 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1488 __m128 v = _mm_loadu_ps((const float *)src);
1489 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1490 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1491 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1492 _mm_store_ps(dst, v);
1501 __m128 v = _mm_load_ps((const float *)src);
1502 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1503 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1504 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1505 _mm_store_ps(dst, v);
// Expands `size` two-float elements from a strided byte source into
// four-float elements in dst, filling the last two components with (0, 1).
//
// dst     - destination, written with aligned stores (must be 16-byte aligned)
// src     - first source element
// size    - number of elements
// stride  - byte distance between source elements
//
// When the source is tightly packed (stride == sizeof(float[2])) two elements
// are converted from each 16-byte load.  Otherwise each element is loaded into
// the low half of the (0, 0, 0, 1) constant with _mm_loadl_pi.
1512 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1514 float *end = dst + size*4;
// v2 supplies the constant upper half (0, 1) of every output element
1515 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1516 if (stride == sizeof(float[2]))
1518 float *end2 = dst + (size&~1)*4;
1519 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1523 __m128 v = _mm_loadu_ps((const float *)src);
1524 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1525 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1527 src += 2*sizeof(float[2]);
1534 __m128 v = _mm_load_ps((const float *)src);
1535 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1536 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1538 src += 2*sizeof(float[2]);
1544 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements from a strided byte source into
// four-float elements in dst, scaling each byte by 1/255.
//
// dst     - destination, written with aligned stores (must be 16-byte aligned)
// src     - first source element
// size    - number of elements
// stride  - byte distance between source elements
//
// When the source is tightly packed (stride == 4) four elements are converted
// per 16-byte load: bytes are widened to 16 and then 32 bits by unpacking with
// zero, converted to float and scaled.  Otherwise each element is read as one
// 32-bit int.
// NOTE(review): the per-element path dereferences src as a const int *, which
// assumes 4-byte alignment of every element - confirm.
1550 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1552 float *end = dst + size*4;
1553 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1554 if (stride == sizeof(unsigned char[4]))
1556 float *end4 = dst + (size&~3)*4;
1557 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1561 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1562 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1563 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1564 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1565 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1567 src += 4*sizeof(unsigned char[4]);
1574 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1575 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1576 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1577 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1578 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1580 src += 4*sizeof(unsigned char[4]);
1586 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1587 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` consecutive four-float elements of dst with the single
// four-float value read from src.
//
// dst   - destination, written with aligned stores (must be 16-byte aligned)
// src   - four floats to replicate; may be unaligned
// size  - number of elements to write; nothing is written when size <= 0
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	float *end = dst + 4*size;
	__m128 v = _mm_loadu_ps(src);
	// every element gets the same value, so one load feeds all the stores
	while (dst < end)
	{
		_mm_store_ps(dst, v);
		dst += 4;
	}
}
// Multiplies `numitems` four-float vectors by a 4x4 matrix.
//
// out4f        - destination vectors
// in4f         - source vectors; may be the same pointer as out4f
// numitems     - number of vectors
// inmatrix16f  - 16 floats, loaded unaligned as four rows m0..m3
//
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where x..w are the lanes of the
// input vector.  An exact identity matrix (bitwise memcmp match) takes a fast
// path that only copies, and copies nothing when in and out are the same.
1605 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1608 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1609 __m128 m0, m1, m2, m3;
1611 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1613 // fast case for identity matrix
1614 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1617 end = out4f + numitems*4;
1618 m0 = _mm_loadu_ps(inmatrix16f);
1619 m1 = _mm_loadu_ps(inmatrix16f + 4);
1620 m2 = _mm_loadu_ps(inmatrix16f + 8);
1621 m3 = _mm_loadu_ps(inmatrix16f + 12);
1622 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1626 __m128 v = _mm_loadu_ps(in4f);
1628 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1629 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1630 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1631 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1640 __m128 v = _mm_load_ps(in4f);
1642 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1643 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1644 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1645 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` four-float vectors from in4f to out4f with memcpy, so the
// two ranges must not overlap.
1653 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1655 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides the vector `in` by its w lane and applies the viewport
// scale and centre, writing the result to `out`.
// The input is rotated to (w, x, y, z) and lane 0 is replaced by 1.0, so lane 0
// of viewportscale/viewportcenter pairs with the 1/w term and lanes 1..3 with
// x, y, z.  The final shuffle rotates the result back, leaving
// center + scale/w in the w lane.
1659 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1661 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1662 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1663 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1664 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro whose visible body is the same as DPSOFTRAST_PROJECTVERTEX:
// divides `in` by its w lane and applies the viewport scale and centre, with
// lane 0 of the viewport vectors pairing with the 1/w term.
1667 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1669 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1670 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1671 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1672 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a four-float vector by the matrix rows m0..m3:
// out = x*m0 + y*m1 + z*m2 + w*m3, where x..w are the lanes of p.
// p is presumably a macro-local copy of `in`; its declaration is not visible
// here - TODO confirm.
1675 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1678 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1679 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1680 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1681 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent of a bounding box after transformation by
// the matrix rows m0..m3, and a clip mask for its corners.
//
// starty, endy     - receive the integer row range derived from the extent
// minpos, maxpos   - component-wise minimum and maximum of the positions
// viewportcenter,
// viewportscale    - viewport mapping in the lane layout used by
//                    DPSOFTRAST_PROJECTVERTEX
// m0..m3           - matrix rows
//
// The return statement is not visible here; presumably clipmask is returned -
// TODO confirm.
1684 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// one bit per box corner, cleared for corners that pass the clipdist test
1686 int clipmask = 0xFF;
1687 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix row so the transformed Y ends up in
// lane 0, where the scalar (_ss) operations below work
1688 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1689 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1690 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1691 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and when it is
// >= 0 clear bit k of clipmask and fold lane0/w into minproj/maxproj
1692 #define BBFRONT(k, pos) \
1694 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1695 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1696 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1699 clipmask &= ~(1<<k); \
1700 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1701 minproj = _mm_min_ss(minproj, proj); \
1702 maxproj = _mm_max_ss(maxproj, proj); \
1706 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1707 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1708 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1709 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1710 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1711 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1715 if (clipmask&(1<<k)) \
1717 if (!(clipmask&(1<<(k^1)))) \
1719 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1720 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1721 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1722 minproj = _mm_min_ss(minproj, proj); \
1723 maxproj = _mm_max_ss(maxproj, proj); \
1725 if (!(clipmask&(1<<(k^2)))) \
1727 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1728 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1729 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1730 minproj = _mm_min_ss(minproj, proj); \
1731 maxproj = _mm_max_ss(maxproj, proj); \
1733 if (!(clipmask&(1<<(k^4)))) \
1735 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1736 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1737 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1738 minproj = _mm_min_ss(minproj, proj); \
1739 maxproj = _mm_max_ss(maxproj, proj); \
1743 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors (the lane that pairs with Y in
// DPSOFTRAST_PROJECTVERTEX) into lane 0 for the scalar maths below
1744 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1745 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen rows
1746 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1747 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1748 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1749 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj; presumably the viewport Y
// scale is negative so the order flips - TODO confirm
1750 *starty = _mm_cvttss_si32(maxproj);
1751 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` four-float positions to screen space without applying a
// matrix.
//
// out4f         - receives an unmodified copy of each input position
// screen4f      - receives each position after DPSOFTRAST_PROJECTVERTEX
// starty, endy  - receive the row range from DPSOFTRAST_Vertex_BoundY
// in4f          - source positions; aligned or unaligned
// numitems      - number of positions
//
// While looping, the component-wise min and max of the inputs are tracked and
// then passed to DPSOFTRAST_Vertex_BoundY with identity matrix rows.  Returns
// that function's result.
1755 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1757 float *end = out4f + numitems*4;
1758 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1759 __m128 minpos, maxpos;
1760 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// the bounds are seeded from the first element, so in4f must hold at least one
1762 minpos = maxpos = _mm_loadu_ps(in4f);
1765 __m128 v = _mm_loadu_ps(in4f);
1766 minpos = _mm_min_ps(minpos, v);
1767 maxpos = _mm_max_ps(maxpos, v);
1768 _mm_store_ps(out4f, v);
1769 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1770 _mm_store_ps(screen4f, v);
1778 minpos = maxpos = _mm_load_ps(in4f);
1781 __m128 v = _mm_load_ps(in4f);
1782 minpos = _mm_min_ps(minpos, v);
1783 maxpos = _mm_max_ps(maxpos, v);
1784 _mm_store_ps(out4f, v);
1785 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1786 _mm_store_ps(screen4f, v);
1793 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1794 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1795 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1796 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1797 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` four-float positions by a 4x4 matrix and projects them
// to screen space.
//
// out4f         - receives each transformed position
// screen4f      - receives each transformed position after projection
// starty, endy  - receive the row range from DPSOFTRAST_Vertex_BoundY
// in4f          - source positions; aligned or unaligned
// numitems      - number of positions
// inmatrix16f   - 16 floats, loaded unaligned as four rows
//
// An exact identity matrix (bitwise memcmp match) is delegated to
// DPSOFTRAST_Vertex_Project.  The min/max bounds are accumulated from the
// untransformed inputs and handed to DPSOFTRAST_Vertex_BoundY together with
// the matrix rows.  Returns that function's result.
1801 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1803 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1804 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1806 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1807 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1808 end = out4f + numitems*4;
1809 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1810 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1811 m0 = _mm_loadu_ps(inmatrix16f);
1812 m1 = _mm_loadu_ps(inmatrix16f + 4);
1813 m2 = _mm_loadu_ps(inmatrix16f + 8);
1814 m3 = _mm_loadu_ps(inmatrix16f + 12);
1815 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// the bounds are seeded from the first element, so in4f must hold at least one
1817 minpos = maxpos = _mm_loadu_ps(in4f);
1820 __m128 v = _mm_loadu_ps(in4f);
1821 minpos = _mm_min_ps(minpos, v);
1822 maxpos = _mm_max_ps(maxpos, v);
1823 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1824 _mm_store_ps(out4f, v);
1825 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1826 _mm_store_ps(screen4f, v);
1834 minpos = maxpos = _mm_load_ps(in4f);
1837 __m128 v = _mm_load_ps(in4f);
1838 minpos = _mm_min_ps(minpos, v);
1839 maxpos = _mm_max_ps(maxpos, v);
1840 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1841 _mm_store_ps(out4f, v);
1842 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843 _mm_store_ps(screen4f, v);
1850 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts one client vertex array into four-float elements stored in
// dpsoftrast.post_array4f[outarray], for the vertex range
// [firstvertex, firstvertex + numvertices) taken from the global state.
//
// outarray  - index of the destination array
// inarray   - DPSOFTRAST_ARRAY_* selector of the source array
//
// Positions are expanded with DPSOFTRAST_Load3fTo4f.  Colours come from the
// float pointer if set, else the unsigned-byte pointer, else the constant
// dpsoftrast.color.  Texture coordinates are dispatched on their component
// count to the 2f/3f/4f loaders.
// The return statement is not visible here; presumably the destination array
// is returned - TODO confirm.
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1858 float *outf = dpsoftrast.post_array4f[outarray];
1859 const unsigned char *inb;
1860 int firstvertex = dpsoftrast.firstvertex;
1861 int numvertices = dpsoftrast.numvertices;
1865 case DPSOFTRAST_ARRAY_POSITION:
1866 stride = dpsoftrast.stride_vertex;
1867 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1868 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1870 case DPSOFTRAST_ARRAY_COLOR:
1871 stride = dpsoftrast.stride_color;
1872 if (dpsoftrast.pointer_color4f)
1874 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1875 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1877 else if (dpsoftrast.pointer_color4ub)
1879 stride = dpsoftrast.stride_color;
1880 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1881 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no colour array bound: replicate the constant colour for every vertex
1885 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinate arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1889 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1890 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1892 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1893 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1896 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1899 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1902 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post-array in place by a 4x4 matrix.  When inarray >= 0 the
// array is first loaded from the client array with DPSOFTRAST_Array_Load;
// otherwise the existing contents of post_array4f[outarray] are used.
// The return statement is not visible here - TODO confirm what is returned.
1914 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1916 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1917 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post-array to screen space with DPSOFTRAST_Vertex_Project.  The
// screen coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and the projector's result to dpsoftrast.drawclipped.
// When inarray >= 0 the array is first loaded with DPSOFTRAST_Array_Load.
// The return statement is not visible here - TODO confirm what is returned.
1922 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1925 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post-array in place by a 4x4 matrix and projects it to screen
// space with DPSOFTRAST_Vertex_TransformProject.  The screen coordinates go to
// dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and the
// result to dpsoftrast.drawclipped.  When inarray >= 0 the array is first
// loaded with DPSOFTRAST_Array_Load.
// The return statement is not visible here - TODO confirm what is returned.
1934 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1937 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1938 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes a per-pixel value z = 1/w across a span, where w is evaluated from
// the triangle's w plane (triangle->w[0] per x, w[1] per y, w[2] constant).
//
// The exact reciprocal is taken only at sub-span boundaries, at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; between them z advances by a
// constant step dz, i.e. it is linearly interpolated.
// The store inside the inner loop is not visible here; presumably each z is
// written to zf[x] - TODO confirm.
1945 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 int startx = span->startx;
1949 int endx = span->endx;
1950 float wslope = triangle->w[0];
1951 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1952 float endz = 1.0f / (w + wslope * startx);
1953 for (x = startx;x < endx;)
1955 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened so it ends on the final pixel of the span
1957 if (nextsub >= endx) nextsub = endsub = endx-1;
1958 endz = 1.0f / (w + wslope * nextsub);
// guard against a zero-length sub-span to avoid dividing by zero
1959 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1960 for (; x <= endsub; x++, z += dz)
1965 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 int startx = span->startx;
1969 int endx = span->endx;
1972 unsigned char * RESTRICT pixelmask = span->pixelmask;
1973 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1977 // handle alphatest now (this affects depth writes too)
1978 if (thread->alphatest)
1979 for (x = startx;x < endx;x++)
1980 if (in4f[x*4+3] < 0.5f)
1981 pixelmask[x] = false;
1982 // FIXME: this does not handle bigendian
1983 switch(thread->fb_blendmode)
1985 case DPSOFTRAST_BLENDMODE_OPAQUE:
1986 for (x = startx;x < endx;x++)
1990 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1991 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1992 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1993 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1994 pixel[x*4+0] = d[0];
1995 pixel[x*4+1] = d[1];
1996 pixel[x*4+2] = d[2];
1997 pixel[x*4+3] = d[3];
2000 case DPSOFTRAST_BLENDMODE_ALPHA:
2001 for (x = startx;x < endx;x++)
2005 a = in4f[x*4+3] * 255.0f;
2006 b = 1.0f - in4f[x*4+3];
2007 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2008 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2009 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2010 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2011 pixel[x*4+0] = d[0];
2012 pixel[x*4+1] = d[1];
2013 pixel[x*4+2] = d[2];
2014 pixel[x*4+3] = d[3];
2017 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2018 for (x = startx;x < endx;x++)
2022 a = in4f[x*4+3] * 255.0f;
2023 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027 pixel[x*4+0] = d[0];
2028 pixel[x*4+1] = d[1];
2029 pixel[x*4+2] = d[2];
2030 pixel[x*4+3] = d[3];
2033 case DPSOFTRAST_BLENDMODE_ADD:
2034 for (x = startx;x < endx;x++)
2038 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042 pixel[x*4+0] = d[0];
2043 pixel[x*4+1] = d[1];
2044 pixel[x*4+2] = d[2];
2045 pixel[x*4+3] = d[3];
2048 case DPSOFTRAST_BLENDMODE_INVMOD:
2049 for (x = startx;x < endx;x++)
2053 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057 pixel[x*4+0] = d[0];
2058 pixel[x*4+1] = d[1];
2059 pixel[x*4+2] = d[2];
2060 pixel[x*4+3] = d[3];
2063 case DPSOFTRAST_BLENDMODE_MUL:
2064 for (x = startx;x < endx;x++)
2068 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072 pixel[x*4+0] = d[0];
2073 pixel[x*4+1] = d[1];
2074 pixel[x*4+2] = d[2];
2075 pixel[x*4+3] = d[3];
2078 case DPSOFTRAST_BLENDMODE_MUL2:
2079 for (x = startx;x < endx;x++)
2083 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2084 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2085 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2086 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2087 pixel[x*4+0] = d[0];
2088 pixel[x*4+1] = d[1];
2089 pixel[x*4+2] = d[2];
2090 pixel[x*4+3] = d[3];
2093 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2094 for (x = startx;x < endx;x++)
2098 a = in4f[x*4+3] * -255.0f;
2099 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2100 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2101 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2102 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2103 pixel[x*4+0] = d[0];
2104 pixel[x*4+1] = d[1];
2105 pixel[x*4+2] = d[2];
2106 pixel[x*4+3] = d[3];
2109 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2110 for (x = startx;x < endx;x++)
2115 b = 1.0f - in4f[x*4+3];
2116 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2117 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2118 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2119 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2120 pixel[x*4+0] = d[0];
2121 pixel[x*4+1] = d[1];
2122 pixel[x*4+2] = d[2];
2123 pixel[x*4+3] = d[3];
2126 case DPSOFTRAST_BLENDMODE_INVADD:
2127 for (x = startx;x < endx;x++)
2131 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2132 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2133 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2134 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2135 pixel[x*4+0] = d[0];
2136 pixel[x*4+1] = d[1];
2137 pixel[x*4+2] = d[2];
2138 pixel[x*4+3] = d[3];
// Writes one span of shaded BGRA8 colors (in4ub, four bytes per pixel, indexed
// by x) into the color framebuffer dpsoftrast.fb_colorpixels[0], combining
// them with the pixels already there according to thread->fb_blendmode.
// span->pixelmask[x] selects which pixels of the span are processed; when
// thread->alphatest is set, low-alpha pixels are removed from the mask first.
2144 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2148 int startx = span->startx;
2149 int endx = span->endx;
2150 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2151 unsigned char * RESTRICT pixelmask = span->pixelmask;
2152 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2153 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views (byte-wise and 32-bit) to the origin of this span
2156 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2157 pixeli += span->y * dpsoftrast.fb_width + span->x;
2158 // handle alphatest now (this affects depth writes too)
2159 if (thread->alphatest)
2160 for (x = startx;x < endx;x++)
// alpha is a byte (0..255) here, so "below one half" is < 128; a comparison
// against 0.5f would only ever reject alpha == 0
2161 if (in4ub[x*4+3] < 128)
2162 pixelmask[x] = false;
2163 // FIXME: this does not handle bigendian
2164 switch(thread->fb_blendmode)
// opaque: plain copy; when four consecutive mask bytes all equal 1 the four
// pixels are copied with a single unaligned 128-bit load/store
2166 case DPSOFTRAST_BLENDMODE_OPAQUE:
2167 for (x = startx;x + 4 <= endx;)
2169 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2171 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// alpha blend: dst += (src - dst) * srcalpha / 256
2185 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND walks the span two pixels at a time, dispatching on the pair of
// mask bytes read as one 16-bit value. The visible paths widen the source and
// destination bytes to 16-bit lanes (src, dst), handle both pixels, only
// pixel x+1 or only pixel x, and pack the result back with unsigned
// saturation; a trailing loop handles the leftover pixel.
// NOTE(review): blend2 looks like the two-pixel blend statement and blend1 the
// one-pixel variant (see the shuffles in the callers) - confirm against the
// full macro body.
2186 #define FINISHBLEND(blend2, blend1) \
2187 for (x = startx;x + 1 < endx;x += 2) \
2190 switch (*(const unsigned short*)&pixelmask[x]) \
2193 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2194 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2196 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2199 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2200 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2202 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2205 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2206 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2208 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2213 for(;x < endx; x++) \
2216 if (!pixelmask[x]) \
2218 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2221 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2225 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2228 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst += (src * srcalpha) >> 8
2232 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: dst = src + dst (saturated to 255 when packed back to bytes)
2241 case DPSOFTRAST_BLENDMODE_ADD:
2242 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2244 case DPSOFTRAST_BLENDMODE_INVMOD:
2246 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2251 case DPSOFTRAST_BLENDMODE_MUL:
2252 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2254 case DPSOFTRAST_BLENDMODE_MUL2:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst -= (src * srcalpha) >> 8
2257 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2259 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2262 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// source added unscaled over attenuated destination:
// dst = src + dst - ((dst * srcalpha) >> 8)
2266 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse add: dst += (255 - dst) * src / 256
2275 case DPSOFTRAST_BLENDMODE_INVADD:
2277 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex across one span and writes
// float RGBA to out4f (texels are read as BGRA bytes, hence the 2/1/0/3
// index order). Texture coordinates come from vertex attribute arrayindex and
// are perspective-corrected with zf[]. The span is cut into subspans of at
// most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: exact coordinates are computed at
// the subspan ends and stepped linearly in 16.16 fixed point in between.
// Four inner loops follow: bilinear+clamp, bilinear+wrap, nearest+clamp,
// nearest+wrap.
2286 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2289 int startx = span->startx;
2290 int endx = span->endx;
2295 float tc[2], endtc[2];
2297 unsigned int tci[2];
2298 unsigned int tci1[2];
2299 unsigned int tcimin[2];
2300 unsigned int tcimax[2];
2305 const unsigned char * RESTRICT pixelbase;
2306 const unsigned char * RESTRICT pixel[4];
2307 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2308 // if no texture is bound, just fill it with white
2311 for (x = startx;x < endx;x++)
2313 out4f[x*4+0] = 1.0f;
2314 out4f[x*4+1] = 1.0f;
2315 out4f[x*4+2] = 1.0f;
2316 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is the byte offset of the selected mip level inside bytes
2320 mip = triangle->mip[texunitindex];
2321 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322 // if this mipmap of the texture is 1 pixel, just fill it with that color
2323 if (texture->mipmap[mip][1] == 4)
// read the single texel of the selected mip level (pixelbase), not the first
// texel of the whole texture; this matches the BGRA8 variant of this sampler
2325 c[0] = pixelbase[2] * (1.0f/255.0f);
2326 c[1] = pixelbase[1] * (1.0f/255.0f);
2327 c[2] = pixelbase[0] * (1.0f/255.0f);
2328 c[3] = pixelbase[3] * (1.0f/255.0f);
2329 for (x = startx;x < endx;x++)
2331 out4f[x*4+0] = c[0];
2332 out4f[x*4+1] = c[1];
2333 out4f[x*4+2] = c[2];
2334 out4f[x*4+3] = c[3];
2338 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2339 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2340 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2341 tcscale[0] = texture->mipmap[mip][2];
2342 tcscale[1] = texture->mipmap[mip][3];
2343 tciwidth = texture->mipmap[mip][2];
2346 tcimax[0] = texture->mipmap[mip][2]-1;
2347 tcimax[1] = texture->mipmap[mip][3]-1;
2348 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2349 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the start of the span; the -0.5 centers the
// sample on texel centers
2350 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2351 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2352 for (x = startx;x < endx;)
2354 unsigned int subtc[2];
2355 unsigned int substep[2];
2356 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2357 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (possibly short) subspan: end exactly on the final pixel of the span
2358 if (nextsub >= endx)
2360 nextsub = endsub = endx-1;
2361 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2365 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2366 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// 16.16 fixed point: per-pixel step and starting coordinate of this subspan
2367 substep[0] = (endtc[0] - tc[0]) * subscale;
2368 substep[1] = (endtc[1] - tc[1]) * subscale;
2369 subtc[0] = tc[0] * (1<<16);
2370 subtc[1] = tc[1] * (1<<16);
// bilinear filtering, coordinates clamped to [tcimin, tcimax]
2373 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2375 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit weights taken from the TOP 12 bits of the 16-bit fraction; the four
// weight products sum to 2^24, which the 1/0xFF000000 (255 * 2^24) scale
// below turns back into 0..1
2377 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2378 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2379 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2380 tci[0] = subtc[0]>>16;
2381 tci[1] = subtc[1]>>16;
2382 tci1[0] = tci[0] + 1;
2383 tci1[1] = tci[1] + 1;
2384 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2385 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2386 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2387 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2388 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2389 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2390 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2391 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2392 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2393 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2394 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2396 out4f[x*4+0] = c[0];
2397 out4f[x*4+1] = c[1];
2398 out4f[x*4+2] = c[2];
2399 out4f[x*4+3] = c[3];
// bilinear filtering, coordinates wrapped with tciwrapmask (size - 1)
2404 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// same top-12-bit weight extraction as in the clamped loop
2406 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2407 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2408 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2409 tci[0] = subtc[0]>>16;
2410 tci[1] = subtc[1]>>16;
2411 tci1[0] = tci[0] + 1;
2412 tci1[1] = tci[1] + 1;
2413 tci[0] &= tciwrapmask[0];
2414 tci[1] &= tciwrapmask[1];
2415 tci1[0] &= tciwrapmask[0];
2416 tci1[1] &= tciwrapmask[1];
2417 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2419 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2420 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2421 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2422 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2423 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2425 out4f[x*4+0] = c[0];
2426 out4f[x*4+1] = c[1];
2427 out4f[x*4+2] = c[2];
2428 out4f[x*4+3] = c[3];
// nearest sampling, coordinates clamped
2432 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2434 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2436 tci[0] = subtc[0]>>16;
2437 tci[1] = subtc[1]>>16;
2438 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2439 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2440 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441 c[0] = pixel[0][2] * (1.0f / 255.0f);
2442 c[1] = pixel[0][1] * (1.0f / 255.0f);
2443 c[2] = pixel[0][0] * (1.0f / 255.0f);
2444 c[3] = pixel[0][3] * (1.0f / 255.0f);
2445 out4f[x*4+0] = c[0];
2446 out4f[x*4+1] = c[1];
2447 out4f[x*4+2] = c[2];
2448 out4f[x*4+3] = c[3];
// nearest sampling, coordinates wrapped
2453 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455 tci[0] = subtc[0]>>16;
2456 tci[1] = subtc[1]>>16;
2457 tci[0] &= tciwrapmask[0];
2458 tci[1] &= tciwrapmask[1];
2459 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
2473 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2477 int startx = span->startx;
2478 int endx = span->endx;
2480 __m128 data, slope, tcscale;
2481 __m128i tcsize, tcmask, tcoffset, tcmax;
2483 __m128i subtc, substep, endsubtc;
2486 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2487 const unsigned char * RESTRICT pixelbase;
2488 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2489 // if no texture is bound, just fill it with white
2492 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2495 mip = triangle->mip[texunitindex];
2496 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2497 // if this mipmap of the texture is 1 pixel, just fill it with that color
2498 if (texture->mipmap[mip][1] == 4)
2500 unsigned int k = *((const unsigned int *)pixelbase);
2501 for (x = startx;x < endx;x++)
2505 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2506 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2507 flags = texture->flags;
2508 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2509 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2510 tcscale = _mm_cvtepi32_ps(tcsize);
2511 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2512 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2513 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2514 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2515 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2516 tcmax = _mm_packs_epi32(tcmask, tcmask);
2517 for (x = startx;x < endx;)
2519 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2520 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2521 if (nextsub >= endx)
2523 nextsub = endsub = endx-1;
2524 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2528 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2529 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2530 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2531 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2532 substep = _mm_slli_epi32(substep, 1);
2535 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2536 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2538 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2539 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2541 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2542 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2543 tci = _mm_madd_epi16(tci, tcoffset);
2544 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2545 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2546 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2547 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2548 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2549 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2550 fracm = _mm_srli_epi16(subtc, 1);
2551 pix1 = _mm_add_epi16(pix1,
2552 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2553 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2554 pix3 = _mm_add_epi16(pix3,
2555 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2556 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2557 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2558 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2559 pix2 = _mm_add_epi16(pix2,
2560 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2561 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2562 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2566 const unsigned char * RESTRICT ptr1;
2567 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2568 tci = _mm_madd_epi16(tci, tcoffset);
2569 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2570 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2571 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2572 fracm = _mm_srli_epi16(subtc, 1);
2573 pix1 = _mm_add_epi16(pix1,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2577 pix1 = _mm_add_epi16(pix1,
2578 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2580 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2584 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2586 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2588 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2589 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590 tci = _mm_madd_epi16(tci, tcoffset);
2591 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2592 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2593 _mm_setzero_si128());
2594 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2595 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2596 _mm_setzero_si128());
2597 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2598 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2599 tci = _mm_madd_epi16(tci, tcoffset);
2600 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602 _mm_setzero_si128());
2603 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605 _mm_setzero_si128());
2606 fracm = _mm_srli_epi16(subtc, 1);
2607 pix1 = _mm_add_epi16(pix1,
2608 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2609 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2610 pix3 = _mm_add_epi16(pix3,
2611 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2612 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2613 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2614 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2615 pix2 = _mm_add_epi16(pix2,
2616 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2617 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2618 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2622 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2623 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2624 tci = _mm_madd_epi16(tci, tcoffset);
2625 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627 _mm_setzero_si128());
2628 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630 _mm_setzero_si128());
2631 fracm = _mm_srli_epi16(subtc, 1);
2632 pix1 = _mm_add_epi16(pix1,
2633 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2636 pix1 = _mm_add_epi16(pix1,
2637 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2638 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2639 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2645 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2647 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2648 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649 tci = _mm_madd_epi16(tci, tcoffset);
2650 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2651 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652 _mm_setzero_si128());
2653 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655 _mm_setzero_si128());
2656 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2657 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658 tci = _mm_madd_epi16(tci, tcoffset);
2659 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661 _mm_setzero_si128());
2662 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664 _mm_setzero_si128());
2665 fracm = _mm_srli_epi16(subtc, 1);
2666 pix1 = _mm_add_epi16(pix1,
2667 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2668 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2669 pix3 = _mm_add_epi16(pix3,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2671 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2672 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2673 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2674 pix2 = _mm_add_epi16(pix2,
2675 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2676 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2677 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2681 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2682 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2683 tci = _mm_madd_epi16(tci, tcoffset);
2684 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2685 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2686 _mm_setzero_si128());
2687 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2688 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2689 _mm_setzero_si128());
2690 fracm = _mm_srli_epi16(subtc, 1);
2691 pix1 = _mm_add_epi16(pix1,
2692 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2693 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2694 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2695 pix1 = _mm_add_epi16(pix1,
2696 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2697 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2698 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2705 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2707 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2709 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2710 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2711 tci = _mm_madd_epi16(tci, tcoffset);
2712 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2713 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2717 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2718 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2719 tci = _mm_madd_epi16(tci, tcoffset);
2720 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2726 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729 tci = _mm_and_si128(tci, tcmax);
2730 tci = _mm_madd_epi16(tci, tcoffset);
2731 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2736 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737 tci = _mm_and_si128(tci, tcmax);
2738 tci = _mm_madd_epi16(tci, tcoffset);
2739 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D BGRA8 span sampler. The only statement
// visible for this function fills the span's output pixels with 255 in every
// byte (opaque white); no texture lookup is performed.
2748 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the length must be (endx - startx) pixels * 4 bytes: with the operands the
// other way round the count is negative and converts to a huge size_t
2751 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Shadowmap lookup for `vector`, returning a float.
// NOTE(review): the body is not visible in this view, so the meaning and
// range of the returned value are unconfirmed.
2754 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies the float color buffer in4f, channel by channel, by vertex
// attribute `arrayindex` interpolated across the span, and stores the product
// in out4f (four floats per pixel, indexed by x from span->startx to
// span->endx - 1).
// data[]/slope[] are set up by DPSOFTRAST_CALCATTRIB4F (defined elsewhere);
// the attribute at pixel x is (data[i] + slope[i]*x) * z.
// NOTE(review): the assignment of z is not visible here; presumably it is the
// per-pixel value taken from zf - confirm.
2760 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2763 int startx = span->startx;
2764 int endx = span->endx;
2769 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2770 for (x = startx;x < endx;x++)
// interpolated attribute value for this pixel
2773 c[0] = (data[0] + slope[0]*x) * z;
2774 c[1] = (data[1] + slope[1]*x) * z;
2775 c[2] = (data[2] + slope[2]*x) * z;
2776 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input color by it
2777 out4f[x*4+0] = in4f[x*4+0] * c[0];
2778 out4f[x*4+1] = in4f[x*4+1] * c[1];
2779 out4f[x*4+2] = in4f[x*4+2] * c[2];
2780 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes vertex attribute `arrayindex`, interpolated across the span, into
// out4f (four floats per pixel, indexed by x from span->startx to
// span->endx - 1).
// data[]/slope[] are set up by DPSOFTRAST_CALCATTRIB4F (defined elsewhere);
// the value at pixel x is (data[i] + slope[i]*x) * z.
// NOTE(review): the assignment of z is not visible here; presumably it is the
// per-pixel value taken from zf - confirm.
2784 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2787 int startx = span->startx;
2788 int endx = span->endx;
2793 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2794 for (x = startx;x < endx;x++)
2797 c[0] = (data[0] + slope[0]*x) * z;
2798 c[1] = (data[1] + slope[1]*x) * z;
2799 c[2] = (data[2] + slope[2]*x) * z;
2800 c[3] = (data[3] + slope[3]*x) * z;
2801 out4f[x*4+0] = c[0];
2802 out4f[x*4+1] = c[1];
2803 out4f[x*4+2] = c[2];
2804 out4f[x*4+3] = c[3];
// Per channel and per pixel of the span: out4f = ina4f + max(inb4f - subcolor, 0).
// subcolor is a four-float color subtracted from inb4f before the add; negative
// differences are clamped to zero. `triangle` is not referenced by the visible
// code.
2808 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2810 int x, startx = span->startx, endx = span->endx;
2811 float c[4], localcolor[4];
// work from a local copy of subcolor inside the loop
2812 localcolor[0] = subcolor[0];
2813 localcolor[1] = subcolor[1];
2814 localcolor[2] = subcolor[2];
2815 localcolor[3] = subcolor[3];
2816 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp at zero
2818 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2819 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2820 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2821 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2822 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2823 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2824 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2825 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float color buffers over the span:
// out4f = ina4f * inb4f for all four channels of every pixel x in
// [span->startx, span->endx). `triangle` is not referenced by the visible code.
2829 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2831 int x, startx = span->startx, endx = span->endx;
2832 for (x = startx;x < endx;x++)
2834 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2835 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2836 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2837 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float color buffers over the span:
// out4f = ina4f + inb4f for all four channels of every pixel x in
// [span->startx, span->endx). No clamping is applied here. `triangle` is not
// referenced by the visible code.
2841 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2843 int x, startx = span->startx, endx = span->endx;
2844 for (x = startx;x < endx;x++)
2846 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2847 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2848 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2849 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float color buffers over the span: out4f = ina4f * a + inb4f * b
// for all four channels, where a = 1 - (alpha of inb4f at that pixel).
// NOTE(review): the assignment of b is not visible here; presumably it is the
// alpha of inb4f, making this a standard "inb over ina" mix - confirm.
2853 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2855 int x, startx = span->startx, endx = span->endx;
2857 for (x = startx;x < endx;x++)
// weight of the first buffer is the inverse of the second buffer's alpha
2859 a = 1.0f - inb4f[x*4+3];
2861 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2862 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2863 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2864 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2868 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2870 int x, startx = span->startx, endx = span->endx;
2871 float localcolor[4], ilerp, lerp;
2872 localcolor[0] = color[0];
2873 localcolor[1] = color[1];
2874 localcolor[2] = color[2];
2875 localcolor[3] = color[3];
2876 ilerp = 1.0f - localcolor[3];
2877 lerp = localcolor[3];
2878 for (x = startx;x < endx;x++)
2880 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2881 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2882 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2883 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Modulates a span of BGRA8 pixels by an interpolated vertex attribute: out4ub = in4ub * attribute.
// The attribute selected by arrayindex is obtained through DPSOFTRAST_CALCATTRIB as `data` and `slope`,
// which are used below as data + slope*x; that value is multiplied by zf[x] at sub-span end points and
// stepped linearly in integer arithmetic (scale 256) in between.
// out4ub and in4ub are indexed by x*4 (4 bytes per pixel); zf is indexed by x.
// NOTE(review): zf presumably holds the per-pixel perspective term produced by the span setup - confirm.
2889 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2893 int startx = span->startx;
2894 int endx = span->endx;
2897 __m128i submod, substep, endsubmod;
2898 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3 (RGBA order becomes BGRA order)
2899 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2900 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at startx times zf[startx], and the same value scaled by 256 as 32-bit integers
2901 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2902 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2903 for (x = startx; x < endx;)
// a sub-span covers at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; subscale converts an end-to-end
// difference into a per-pixel step on the 256 scale
2905 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2906 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// the final sub-span stops at the last pixel (endx-1), so the step is derived from its real length
2907 if (nextsub >= endx)
2909 nextsub = endsub = endx-1;
2910 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the end of this sub-span, the integer step towards it, and the integer end value
2914 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2915 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2916 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// narrow to 16-bit lanes: low half = factor for pixel x, high half = factor for pixel x+1
2917 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2918 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): each iteration advances x by 2 but adds the per-pixel substep only once to both halves;
// verify whether the step should be doubled for the paired lanes.
2919 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// source bytes are unpacked into the high byte of each 16-bit lane (value << 8), so the unsigned high
// multiply by the 256-scaled factor yields pixel * factor; _mm_packus_epi16 saturates to 0..255
2921 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2922 pix = _mm_mulhi_epu16(pix, submod);
2923 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same arithmetic for a single pixel, using a 32-bit load and store
2927 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2928 pix = _mm_mulhi_epu16(pix, submod);
2929 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Fills a span of BGRA8 pixels directly from an interpolated vertex attribute (no input buffer).
// The attribute selected by arrayindex is obtained through DPSOFTRAST_CALCATTRIB as `data` and `slope`,
// used below as data + slope*x; it is multiplied by zf[x] at sub-span end points and stepped linearly
// in integer arithmetic with a scale of 4095, then shifted right by 4 to reach the 0..255 byte range.
// out4ub is indexed by x*4 (4 bytes per pixel); zf is indexed by x.
// NOTE(review): zf presumably holds the per-pixel perspective term produced by the span setup - confirm.
2936 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2940 int startx = span->startx;
2941 int endx = span->endx;
2944 __m128i submod, substep, endsubmod;
2945 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 and keeps 1 and 3 (RGBA order becomes BGRA order)
2946 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2947 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at startx times zf[startx], and the same value scaled by 4095 as 32-bit integers
2948 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2949 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2950 for (x = startx; x < endx;)
// a sub-span covers at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; subscale converts an end-to-end
// difference into a per-pixel step on the 4095 scale
2952 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2953 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// the final sub-span stops at the last pixel (endx-1), so the step is derived from its real length
2954 if (nextsub >= endx)
2956 nextsub = endsub = endx-1;
2957 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the end of this sub-span, the integer step towards it, and the integer end value
2961 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2962 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2963 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// narrow to 16-bit lanes: low half = value for pixel x, high half = value for pixel x+1
2964 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2965 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): each iteration advances x by 2 but adds the per-pixel substep only once to both halves;
// verify whether the step should be doubled for the paired lanes.
2966 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 fractional bits (4095 >> 4 == 255); _mm_packus_epi16 saturates to 0..255
2968 __m128i pix = _mm_srai_epi16(submod, 4);
2969 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same conversion for a single pixel, using a 32-bit store
2973 __m128i pix = _mm_srai_epi16(submod, 4);
2974 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: out4ub = ina4ub + (inb4ub - subcolor), computed in 16-bit lanes and saturated to
// 0..255 by _mm_packus_epi16 when stored. Two pixels are handled per iteration of the main loop.
// subcolor holds four floats; it is scaled by 255 and its components 0 and 2 are swapped to match the
// byte order of the pixel buffers.
// NOTE(review): the float version DPSOFTRAST_Draw_Span_AddBloom raises (inb - subcolor) to 0 before
// adding. Here the difference is added unclamped, so an inb value below the threshold lowers the result
// instead of leaving ina unchanged. Confirm whether a clamp (e.g. _mm_max_epi16 with zero) is intended.
2981 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2984 int x, startx = span->startx, endx = span->endx;
// threshold as 32-bit integers on the 0..255 scale, components 0 and 2 swapped
2985 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrowed to 16-bit lanes; both 64-bit halves hold the same four components (one copy per pixel)
2986 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2987 for (x = startx;x+2 <= endx;x+=2)
// zero-extend 8 bytes (two pixels) of each input to 16-bit lanes
2989 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2990 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2991 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2992 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel, using 32-bit loads and a 32-bit store
2996 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2997 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2998 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2999 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 component-wise product of two spans: out4ub = (ina4ub * inb4ub) >> 8.
// Two pixels are handled per iteration of the main loop; buffers are indexed by x*4.
// triangle is not referenced in the visible statements.
3004 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3007 int x, startx = span->startx, endx = span->endx;
3008 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended (value), inb bytes go to the high byte of each lane (value << 8),
// so the unsigned high multiply returns (a * b * 256) >> 16 = (a * b) >> 8
3010 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3011 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3012 pix1 = _mm_mulhi_epu16(pix1, pix2);
3013 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel, using 32-bit loads and a 32-bit store
3017 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3018 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3019 pix1 = _mm_mulhi_epu16(pix1, pix2);
3020 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 component-wise sum of two spans: out4ub = ina4ub + inb4ub, saturated to 255 by
// _mm_packus_epi16 when stored. Two pixels are handled per iteration of the main loop.
// triangle is not referenced in the visible statements.
3025 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3028 int x, startx = span->startx, endx = span->endx;
3029 for (x = startx;x+2 <= endx;x+=2)
// zero-extend 8 bytes (two pixels) of each input to 16-bit lanes so the sum cannot wrap
3031 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3032 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3033 pix1 = _mm_add_epi16(pix1, pix2);
3034 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel, using 32-bit loads and a 32-bit store
3038 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3039 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3040 pix1 = _mm_add_epi16(pix1, pix2);
3041 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 tinted add: out4ub = ina4ub + inb4ub * tint, saturated to 255 by _mm_packus_epi16 when stored.
// inbtintbgra holds four floats that are used in the order given (no component swap is applied);
// they are scaled by 256 and narrowed to 16-bit lanes with signed saturation.
// Two pixels are handled per iteration of the main loop.
3046 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3049 int x, startx = span->startx, endx = span->endx;
3050 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// both 64-bit halves hold the same four tint components (one copy per pixel)
3051 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3052 for (x = startx;x+2 <= endx;x+=2)
// ina is zero-extended; inb goes to the high byte of each lane (value << 8) so that the unsigned
// high multiply by the 256-scaled tint returns inb * tint on the 0..255 scale
3054 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3055 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3056 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3057 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel, using 32-bit loads and a 32-bit store
3061 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3063 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3064 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 alpha blend of two spans: out4ub = ina4ub + ((inb4ub - ina4ub) * alpha) / 256, where alpha is
// the fourth byte of the inb4ub pixel. Two pixels are handled per iteration of the main loop.
// triangle is not referenced in the visible statements.
3069 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3072 int x, startx = span->startx, endx = span->endx;
3073 for (x = startx;x+2 <= endx;x+=2)
3075 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3076 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 (the fourth component) of each pixel of inb to all four lanes of that pixel
3077 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high multiply returns (diff * blend * 256) >> 16
3078 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3079 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel; only the low four lanes need the broadcast
3083 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3084 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3085 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3086 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3087 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span towards one constant color: out4ub = in4ub + ((color - in4ub) * alpha) / 256,
// where alpha is the fourth component of color on the 0..255 scale.
// color holds four floats; it is scaled by 255 and its components 0 and 2 are swapped to match the
// byte order of the pixel buffers. Two pixels are handled per iteration of the main loop.
3092 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3095 int x, startx = span->startx, endx = span->endx;
3096 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrowed to 16-bit lanes; both 64-bit halves hold the same four components (one copy per pixel)
3097 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// lane 3 (the fourth component) broadcast to every lane and pre-shifted left by 4 for the multiply below
3098 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3099 for (x = startx;x+2 <= endx;x+=2)
// the difference is also shifted left by 4, so the signed high multiply returns (diff * alpha * 256) >> 16
3101 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3102 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3103 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same arithmetic for a single pixel, using a 32-bit load and store
3107 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3108 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3109 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3116 void DPSOFTRAST_VertexShader_Generic(void)
3118 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3119 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3120 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3121 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3125 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3127 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3128 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3129 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3132 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3134 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3135 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3136 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3138 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3139 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3142 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3144 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3147 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3149 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3152 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3157 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3158 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3163 void DPSOFTRAST_VertexShader_PostProcess(void)
3165 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3166 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3167 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3170 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3172 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3173 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3174 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3175 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3177 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3178 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3180 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3181 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3183 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3184 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3186 // TODO: implement saturation
3188 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3190 // TODO: implement gammaramps
3192 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3197 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3199 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3202 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3204 // this is never called (because colormask is off when this shader is used)
3205 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3206 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3208 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3209 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3214 void DPSOFTRAST_VertexShader_FlatColor(void)
3216 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3217 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color mode: output = color texture * Color_Ambient, with the alpha factor
// taken from the Alpha uniform.
// Results are written straight into the framebuffer row unless alpha test is enabled or the blend mode
// is not opaque; in that case they go to buffer_FragColorbgra8 and DPSOFTRAST_Draw_Span_FinishBGRA8
// completes the span.
3220 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer, offset by (span->y, span->x), 4 bytes per pixel
3224 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3225 int x, startx = span->startx, endx = span->endx;
3226 __m128i Color_Ambientm;
3227 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3228 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// array index 2 is DPSOFTRAST_ARRAY_TEXCOORD0
3231 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect the output to the local buffer when the span still has to go through the finish step
3232 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3233 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256 as integers with components 0 and 2 swapped; the fourth lane is cleared
// and replaced by Alpha scaled by 255; the result is narrowed to 16-bit lanes (one copy per 64-bit half)
3234 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3235 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3236 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3237 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3238 for (x = startx;x < endx;x++)
// fast path: four consecutive pixelmask bytes read as one 32-bit word; 0x01010101 means all four are 1,
// and the four pixels are processed with 128-bit loads and one 128-bit store
3241 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go to the high byte of each 16-bit lane, so the high multiply gives color * factor
3244 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3245 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3246 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3247 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic with a 32-bit load and store
3253 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3254 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3255 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only spans rendered into the local buffer need the finish step
3257 if (pixel == buffer_FragColorbgra8)
3258 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3264 void DPSOFTRAST_VertexShader_VertexColor(void)
3266 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3268 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color mode:
// output = color texture * (Color_Diffuse * interpolated vertex color + Color_Ambient),
// with the alpha factor taken from the Alpha uniform (the diffuse alpha lane is cleared).
// Results are written straight into the framebuffer row unless alpha test is enabled or the blend mode
// is not opaque; in that case they go to buffer_FragColorbgra8 and DPSOFTRAST_Draw_Span_FinishBGRA8
// completes the span.
3271 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer, offset by (span->y, span->x), 4 bytes per pixel
3275 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3276 int x, startx = span->startx, endx = span->endx;
3277 __m128i Color_Ambientm, Color_Diffusem;
3279 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3280 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3281 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3283 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// array index 2 is DPSOFTRAST_ARRAY_TEXCOORD0
3284 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect the output to the local buffer when the span still has to go through the finish step
3285 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3286 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256 as integers with components 0 and 2 swapped; the fourth lane is cleared
// and replaced by Alpha scaled by 255; the result is narrowed to 16-bit lanes (one copy per 64-bit half)
3287 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3288 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3289 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3290 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse scaled by 4096 with its fourth lane cleared; the vertex color below is also scaled by
// 4096, so their high product (4096 * 4096 / 65536 = 256) lands on the same scale as Color_Ambientm
3291 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3292 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3293 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color as value + per-x slope, components 0 and 2 swapped, advanced to startx, scaled by 4096
3294 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3295 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3296 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3297 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3298 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3299 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances by one slope per loop iteration
3300 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3302 __m128i color, mod, pix;
// fast path: four consecutive pixelmask bytes read as one 32-bit word; 0x01010101 means all four are 1
3303 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four buffer_z values are loaded at once; each pixel multiplies the current data by its own z
// (broadcast with _mm_shuffle_ps) and data is advanced by slope between pixels
3306 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3307 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3308 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3309 data = _mm_add_ps(data, slope);
3310 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3311 data = _mm_add_ps(data, slope);
3312 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3313 data = _mm_add_ps(data, slope);
3314 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) * texture, with texture bytes in the high byte of each lane
3315 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3316 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3317 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3318 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3319 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic with one z value and a 32-bit load and store
3325 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3326 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3327 mod = _mm_packs_epi32(mod, mod);
3328 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3329 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only spans rendered into the local buffer need the finish step
3331 if (pixel == buffer_FragColorbgra8)
3332 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3338 void DPSOFTRAST_VertexShader_Lightmap(void)
3340 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3341 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3342 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap mode:
// output = color texture * (Color_Diffuse * lightmap texture + Color_Ambient),
// plus glow texture * Color_Glow when the GLOW permutation bit is set.
// The alpha factor comes from the Alpha uniform (the diffuse and glow alpha lanes are cleared).
// Results are written straight into the framebuffer row unless alpha test is enabled or the blend mode
// is not opaque; in that case they go to buffer_FragColorbgra8 and DPSOFTRAST_Draw_Span_FinishBGRA8
// completes the span.
3345 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3348 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: the framebuffer, offset by (span->y, span->x), 4 bytes per pixel
3349 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3350 int x, startx = span->startx, endx = span->endx;
3351 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3352 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3355 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// the color texture is sampled with texcoord0, the lightmap with texcoord4
3358 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3359 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect the output to the local buffer when the span still has to go through the finish step
3360 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3361 pixel = buffer_FragColorbgra8;
// Color_Ambient scaled by 256 as integers with components 0 and 2 swapped; the fourth lane is cleared
// and replaced by Alpha scaled by 255; the result is narrowed to 16-bit lanes (one copy per 64-bit half)
3362 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3363 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3364 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3365 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse prepared the same way (scale 256), with its fourth lane cleared
3366 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3369 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// the glow texture shares texcoord0 with the color texture; Color_Glow is prepared like Color_Diffuse
3371 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3372 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3373 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3374 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3375 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3376 for (x = startx;x < endx;x++)
3378 __m128i color, lightmap, glow, pix;
// fast path: four consecutive pixelmask bytes read as one 32-bit word; 0x01010101 means all four are 1
3379 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3382 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3383 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3384 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, two pixels per register
3385 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3386 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3387 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3388 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3389 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3390 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3391 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the high 64 bits of lightmap are zero, so the high half of the product is
// glowcolor * glow while the low half is (diffuse * lightmap + ambient) * color; the two halves are
// then added together
3397 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3398 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3399 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3400 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3401 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3402 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
3407 for (x = startx;x < endx;x++)
3409 __m128i color, lightmap, pix;
// fast path: four consecutive pixelmask bytes read as one 32-bit word; 0x01010101 means all four are 1
3410 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3413 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3414 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3415 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3416 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3417 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3419 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic with 32-bit loads and a 32-bit store
3425 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3426 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3427 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3428 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only spans rendered into the local buffer need the finish step
3431 if (pixel == buffer_FragColorbgra8)
3432 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3438 void DPSOFTRAST_VertexShader_FakeLight(void)
3440 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3443 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3446 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3447 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3448 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3449 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3450 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the model-space light-direction-map mode: delegates to
// DPSOFTRAST_VertexShader_Lightmap without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3460 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3462 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage of the tangent-space light-direction-map mode: delegates to
// DPSOFTRAST_VertexShader_Lightmap without any additional work.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3475 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the LightDirection mode.
// For every vertex it re-expresses the LightDir uniform and the
// vertex-to-eye vector as dot products against the three per-vertex vectors
// stored in TEXCOORD1/2/3 (named svector, tvector and normal below), then
// overwrites TEXCOORD1 with the light result and TEXCOORD2 with the eye
// result. TEXCOORD0 is run through the TexMatrix uniform and the position
// array is finally handed to DPSOFTRAST_Array_TransformProject.
// NOTE(review): svector/tvector/normal are presumably the tangent-space
// basis - confirm against the code that fills those arrays.
// NOTE(review): the declarations of i, LightDir, EyeVector, position,
// svector, tvector and normal are not visible in this listing.
3481 void DPSOFTRAST_VertexShader_LightDirection(void)
3484 int numvertices = dpsoftrast.numvertices;
3486 float LightVector[4];
3487 float EyePosition[4];
3488 float EyeVectorModelSpace[4];
// fetch the two uniforms once, outside the per-vertex loop
3494 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3495 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3496 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3497 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3498 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3499 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3500 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3501 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the arrays used below into dpsoftrast.post_array4f
// (presumably the helpers take (out, in[, matrix]) - defined outside this view)
3502 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3503 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3504 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3505 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3506 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3507 for (i = 0;i < numvertices;i++)
// copy the inputs to locals first: TEXCOORD1 and TEXCOORD2 are overwritten
// further down while svector/tvector are still needed
3509 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3510 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3511 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3512 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3513 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3514 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3515 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3516 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3517 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3518 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3519 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3520 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light direction expressed as (svector.L, tvector.L, normal.L)
3521 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3522 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3523 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// TEXCOORD1 now carries the light vector; the fourth component is forced to 0
3524 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3525 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3526 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, then the same three dot products
3528 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3529 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3530 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3531 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3532 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3533 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector; the fourth component is forced to 0
3534 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3535 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// positions are projected last because the loop above reads them unprojected.
// NOTE(review): the second argument is -1 here rather than an array id; its
// meaning is defined by the helper, outside this view.
3539 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the lighting pixel shaders.
// Min/Max expand each argument twice, so arguments with side effects
// (e.g. i++) must not be passed to them.
3542 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3543 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of two indexable operands
3544 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length, i.e. v dotted with itself (no square root taken)
3545 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length via sqrt() of the squared length
3546 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line statement macro: computes len = sqrt(v.v) in a local float.
// NOTE(review): the remaining lines of the macro body are not visible in this
// listing; presumably v is scaled by 1/len - confirm against the full source.
3547 #define DPSOFTRAST_Vector3Normalize(v)\
3550 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage for the LightDirection mode: shades one span lit by a single
// directional light. Depending on thread->shader_permutation one of three
// paths runs: SPECULAR (ambient + diffuse + specular), DIFFUSE (ambient +
// diffuse) or a fallback that applies only the ambient colour. In every path
// an optional glow texture term is added when SHADERPERMUTATION_GLOW is set,
// each channel is clamped to at most 255, and the finished span is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f and shader_permutation
//   triangle - triangle being drawn, forwarded to the span helpers
//   span     - span to shade; pixels [span->startx, span->endx) are written
// NOTE(review): the declarations of z, d, glosstex, eyenormal, diffuse and
// specular, the per-pixel assignment of z, and the brace/else lines are not
// visible in this listing.
// NOTE(review): the pants/shirt textures are fetched for every path when
// SHADERPERMUTATION_COLORMAPPING is set, but only the SPECULAR loop adds them
// to diffusetex; the DIFFUSE and ambient-only loops ignore them. Confirm
// whether that is intended.
3561 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3563 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3564 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3566 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 int x, startx = span->startx, endx = span->endx;
3572 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3573 float LightVectordata[4];
3574 float LightVectorslope[4];
3575 float EyeVectordata[4];
3576 float EyeVectorslope[4];
3578 float diffusetex[4];
3580 float surfacenormal[4];
3581 float lightnormal[4];
3583 float specularnormal[4];
3586 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped (uniform +0 goes
// to [2], +2 goes to [0]) so that index k lines up with byte k of the *bgra8
// texture buffers they are multiplied with below.
3588 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3589 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3590 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3591 Color_Glow[3] = 0.0f;
3592 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3593 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3594 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the ambient alpha slot is taken from the separate Alpha uniform; it scales the output alpha in every path
3595 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3596 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3597 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3598 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3599 Color_Pants[3] = 0.0f;
3600 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3601 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3602 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3603 Color_Shirt[3] = 0.0f;
// texture fetches shared by all paths: colour always, pants/shirt and glow only when their permutation bit is set
3604 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3605 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3606 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3608 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3613 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3615 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3617 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3618 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3619 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3620 Color_Diffuse[3] = 0.0f;
3621 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3622 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3623 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3624 LightColor[3] = 0.0f;
// TEXCOORD1 (light vector) interpolants: evaluated per pixel below as (data + slope*x) * z
3625 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the raw gloss alpha byte (0..255)
3631 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD2 (eye vector) interpolants, used the same way as the light vector
3632 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3633 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634 for (x = startx;x < endx;x++)
// texel values stay in raw byte scale (0..255), not normalised to 0..1
3637 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3638 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3639 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3640 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3641 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add tinted pants/shirt layers; their [3] factors are 0, so alpha is left unchanged
3643 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3644 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3645 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3646 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3648 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3649 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3650 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3651 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1, reading bytes 2,1,0 into components 0,1,2
3652 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3653 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3654 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3655 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector at pixel x (z presumably comes from buffer_z[x]; its assignment is not visible here)
3657 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3658 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3659 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3660 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye vector at pixel x
3662 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3663 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3664 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3665 DPSOFTRAST_Vector3Normalize(eyenormal);
// normalised sum of the light and eye directions
3667 specularnormal[0] = lightnormal[0] + eyenormal[0];
3668 specularnormal[1] = lightnormal[1] + eyenormal[1];
3669 specularnormal[2] = lightnormal[2] + eyenormal[2];
3670 DPSOFTRAST_Vector3Normalize(specularnormal);
// both dot products are clamped at 0 before use; the specular exponent is SpecularPower scaled by the gloss alpha byte
3672 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3673 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3674 specular = pow(specular, SpecularPower * glosstex[3]);
3675 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// with glow: glow + ambient + (diffuse + specular) * light colour, truncated to int and clamped to 255
3677 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3678 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3679 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3680 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// without glow: same formula minus the glow term (the separating else line is not visible in this listing)
3684 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3685 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3686 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3687 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3689 buffer_FragColorbgra8[x*4+0] = d[0];
3690 buffer_FragColorbgra8[x*4+1] = d[1];
3691 buffer_FragColorbgra8[x*4+2] = d[2];
3692 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse, no specular term ----
3695 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3697 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3698 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3699 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3700 Color_Diffuse[3] = 0.0f;
3701 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3702 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3703 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3704 LightColor[3] = 0.0f;
3705 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3706 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3707 for (x = startx;x < endx;x++)
// NOTE(review): unlike path 1, pants/shirt are not added to diffusetex here
3710 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3711 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3712 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3713 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// same normal decode and light-vector interpolation as in path 1
3714 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3717 DPSOFTRAST_Vector3Normalize(surfacenormal);
3719 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3720 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3721 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3722 DPSOFTRAST_Vector3Normalize(lightnormal);
3724 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3725 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// with glow: glow + texel * (ambient + diffuse colour * N.L * light colour), clamped to 255
3727 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3728 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3729 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3730 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// without glow (the separating else line is not visible in this listing)
3734 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3735 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3736 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3737 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3739 buffer_FragColorbgra8[x*4+0] = d[0];
3740 buffer_FragColorbgra8[x*4+1] = d[1];
3741 buffer_FragColorbgra8[x*4+2] = d[2];
3742 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (presumably the final else branch; the else line is not visible in this listing) ----
3747 for (x = startx;x < endx;x++)
3750 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3751 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3752 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3753 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3755 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// with glow: glow + texel * ambient, clamped to 255
3757 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3758 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3759 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3760 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// without glow: texel * ambient only
3764 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3765 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3766 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3767 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3769 buffer_FragColorbgra8[x*4+0] = d[0];
3770 buffer_FragColorbgra8[x*4+1] = d[1];
3771 buffer_FragColorbgra8[x*4+2] = d[2];
3772 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
3775 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the LightSource mode (a light with a position).
// For every vertex it forms the vertex-to-light and vertex-to-eye vectors,
// expresses each as dot products against the three per-vertex vectors stored
// in TEXCOORD1/2/3 (svector, tvector, normal), and overwrites TEXCOORD1 with
// the light result and TEXCOORD2 with the eye result. Afterwards the position
// array is projected and DPSOFTRAST_Array_Transform is called with TEXCOORD3,
// POSITION and the ModelToLight uniform.
// NOTE(review): svector/tvector/normal are presumably the tangent-space
// basis - confirm against the code that fills those arrays.
// NOTE(review): the declarations of i, EyeVector, position, svector, tvector
// and normal are not visible in this listing.
3780 void DPSOFTRAST_VertexShader_LightSource(void)
3783 int numvertices = dpsoftrast.numvertices;
3784 float LightPosition[4];
3785 float LightVector[4];
3786 float LightVectorModelSpace[4];
3787 float EyePosition[4];
3788 float EyeVectorModelSpace[4];
// fetch the two uniforms once, outside the per-vertex loop
3794 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3795 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3796 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3797 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3798 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3799 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3800 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3801 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the arrays used below into dpsoftrast.post_array4f; TEXCOORD4 is
// loaded as well but is not read anywhere else in this function
3802 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3803 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3804 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3805 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3806 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3807 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3808 for (i = 0;i < numvertices;i++)
// copy the inputs to locals first: TEXCOORD1 and TEXCOORD2 are overwritten
// further down while svector/tvector are still needed
3810 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3811 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3812 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3813 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3814 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3815 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3816 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3817 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3818 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3819 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3820 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3821 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// vector from the vertex to the light (left unnormalised here)
3822 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3823 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3824 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// expressed as (svector.L, tvector.L, normal.L)
3825 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3826 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3827 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector; the fourth component is forced to 0
3828 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3829 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3830 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3831 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// vector from the vertex to the eye, then the same three dot products
3832 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3833 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3834 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3835 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3836 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3837 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector; the fourth component is forced to 0
3838 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3839 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// positions are projected after the loop because the loop reads them unprojected.
// NOTE(review): the second argument is -1 here rather than an array id; its
// meaning is defined by the helper, outside this view.
3843 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// presumably (out, in, matrix): TEXCOORD3 receives the position transformed by
// the ModelToLight matrix, replacing the normal read above - confirm the
// argument order against the helper's definition
3844 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3847 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3850 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3851 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3852 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3853 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858 int x, startx = span->startx, endx = span->endx;
3859 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3860 float CubeVectordata[4];
3861 float CubeVectorslope[4];
3862 float LightVectordata[4];
3863 float LightVectorslope[4];
3864 float EyeVectordata[4];
3865 float EyeVectorslope[4];
3867 float diffusetex[4];
3869 float surfacenormal[4];
3870 float lightnormal[4];
3872 float specularnormal[4];
3875 float SpecularPower;
3876 float CubeVector[4];
3879 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3880 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3881 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3882 Color_Glow[3] = 0.0f;
3883 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3884 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3885 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3886 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3887 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890 Color_Diffuse[3] = 0.0f;
3891 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3892 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3893 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3894 Color_Specular[3] = 0.0f;
3895 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3896 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3897 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3898 Color_Pants[3] = 0.0f;
3899 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3900 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3901 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3902 Color_Shirt[3] = 0.0f;
3903 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3904 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3905 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3906 LightColor[3] = 0.0f;
3907 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3908 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3909 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3911 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3912 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3913 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3914 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3916 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3917 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3920 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3921 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3923 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925 for (x = startx;x < endx;x++)
3928 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3929 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3930 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3931 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3932 if (attenuation < 0.01f)
3934 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3936 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3937 if (attenuation < 0.01f)
3941 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3942 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3943 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3944 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3945 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3947 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3948 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3949 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3950 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3952 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3953 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3954 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3955 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3956 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959 DPSOFTRAST_Vector3Normalize(surfacenormal);
3961 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3962 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3963 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3964 DPSOFTRAST_Vector3Normalize(lightnormal);
3966 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3967 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3968 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3969 DPSOFTRAST_Vector3Normalize(eyenormal);
3971 specularnormal[0] = lightnormal[0] + eyenormal[0];
3972 specularnormal[1] = lightnormal[1] + eyenormal[1];
3973 specularnormal[2] = lightnormal[2] + eyenormal[2];
3974 DPSOFTRAST_Vector3Normalize(specularnormal);
3976 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3977 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3978 specular = pow(specular, SpecularPower * glosstex[3]);
3979 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3981 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3982 attenuation *= (1.0f / 255.0f);
3983 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3984 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3985 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3986 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3990 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3991 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3992 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3993 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3995 buffer_FragColorbgra8[x*4+0] = d[0];
3996 buffer_FragColorbgra8[x*4+1] = d[1];
3997 buffer_FragColorbgra8[x*4+2] = d[2];
3998 buffer_FragColorbgra8[x*4+3] = d[3];
4001 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4003 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4004 for (x = startx;x < endx;x++)
4007 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4008 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4009 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4010 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4011 if (attenuation < 0.01f)
4013 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4015 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4016 if (attenuation < 0.01f)
4020 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4021 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4022 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4023 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4026 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4027 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4028 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4029 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4031 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4032 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4033 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4034 DPSOFTRAST_Vector3Normalize(surfacenormal);
4036 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4037 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4038 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4039 DPSOFTRAST_Vector3Normalize(lightnormal);
4041 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4042 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4044 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4045 attenuation *= (1.0f / 255.0f);
4046 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4047 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4048 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4049 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4053 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4054 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4055 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4056 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4058 buffer_FragColorbgra8[x*4+0] = d[0];
4059 buffer_FragColorbgra8[x*4+1] = d[1];
4060 buffer_FragColorbgra8[x*4+2] = d[2];
4061 buffer_FragColorbgra8[x*4+3] = d[3];
4066 for (x = startx;x < endx;x++)
4069 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4070 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4071 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4072 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4073 if (attenuation < 0.01f)
4075 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4077 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4078 if (attenuation < 0.01f)
4082 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4083 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4084 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4085 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4086 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4088 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4089 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4090 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4091 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4093 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4095 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4096 attenuation *= (1.0f / 255.0f);
4097 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4098 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4099 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4100 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4104 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4105 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4106 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4107 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4109 buffer_FragColorbgra8[x*4+0] = d[0];
4110 buffer_FragColorbgra8[x*4+1] = d[1];
4111 buffer_FragColorbgra8[x*4+2] = d[2];
4112 buffer_FragColorbgra8[x*4+3] = d[3];
4115 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the Refraction row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4121 void DPSOFTRAST_VertexShader_Refraction(void)
4123 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the Refraction row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4126 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4132 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the Water row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4138 void DPSOFTRAST_VertexShader_Water(void)
4140 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the Water row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4144 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4147 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4148 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4150 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4151 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the ShowDepth row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4156 void DPSOFTRAST_VertexShader_ShowDepth(void)
4158 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the ShowDepth row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4161 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4164 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4165 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4166 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4167 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4168 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the DeferredGeometry row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4173 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4175 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the DeferredGeometry row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4178 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4181 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4182 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4183 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4184 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4185 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the DeferredLightSource row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4190 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4192 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the DeferredLightSource row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4195 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4198 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4199 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4200 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4201 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4202 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage listed on the DeferredBounceLight row of DPSOFTRAST_ShaderModeTable.
// The single visible statement passes the position array (named for both array
// arguments) and the uniform4f block at ModelViewProjectionMatrixM1 to
// DPSOFTRAST_Array_TransformProject.
4207 void DPSOFTRAST_VertexShader_DeferredBounceLight(void)
4209 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage listed on the DeferredBounceLight row of DPSOFTRAST_ShaderModeTable.
// The visible body is a stub: it opens the span with DPSOFTRAST_Draw_Span_Begin,
// zero-fills the BGRA8 bytes for columns span->startx up to (not including)
// span->endx, and hands that all-zero color buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread, triangle and span are forwarded unchanged to both helpers.
4212 void DPSOFTRAST_PixelShader_DeferredBounceLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// stack scratch buffers sized for the longest span; the color buffer holds 4 bytes per column
4215 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4216 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4217 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the [startx, endx) slice is cleared; bytes outside it are left uninitialized
4218 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4219 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; DPSOFTRAST_ShaderModeTable holds one of these per mode.
// NOTE(review): every table row starts with an integer ahead of the Vertex pointer, and
// DPSOFTRAST_Interpret_Draw reads a `lodarrayindex` member, so a member declaration is
// expected before Vertex. It is not among the lines shown in this listing - confirm.
4224 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage callback; takes no arguments
4227 void (*Vertex)(void);
// span stage callback; DPSOFTRAST_Draw_ProcessSpans invokes it once per queued span
4228 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* ids used by the mode; table rows end the list with ~0 and
// readers compare each entry against DPSOFTRAST_ARRAY_TOTAL
4229 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit ids used by the mode; readers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS
4230 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4232 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with SHADERMODE_COUNT rows, indexed by thread->shader_mode /
// dpsoftrast.shader_mode elsewhere in this file.
// Row layout: leading integer, vertex callback, span callback, list of vertex
// arrays, list of texture units. Both lists use ~0 as their last listed value.
// NOTE(review): row order presumably follows the SHADERMODE_* enum - confirm against its definition.
4234 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4236 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4237 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4238 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4239 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4240 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4241 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4242 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4243 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4244 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4245 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4246 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// The remaining rows give only the arrays list. With no initializer, texunits is
// zero-filled by C aggregate initialization, so its entries are 0 rather than ~0 and
// the texture-unit loop in DPSOFTRAST_Interpret_Draw (which compares entries against
// DPSOFTRAST_MAXTEXTUREUNITS) finds no terminating value for these modes.
// NOTE(review): consider an explicit {~0} here, as on the Depth_Or_Shadow and FakeLight rows.
4247 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4248 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4249 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4250 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4251 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
4252 {2, DPSOFTRAST_VertexShader_DeferredBounceLight, DPSOFTRAST_PixelShader_DeferredBounceLight, {~0}}
// Drains the thread's queued spans. For each span it optionally depth-tests the
// columns, runs the current shader mode's Span callback, optionally writes depth,
// and finally resets thread->numspans to 0.
// thread: per-thread rasterizer state holding the spans[] and triangles[] queues.
// NOTE: this listing omits some short lines (braces, brief statements, loop bodies);
// the comments below describe the visible lines only.
4255 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4262 // unsigned int *colorpixel;
4263 unsigned int *depthpixel;
4269 DPSOFTRAST_State_Triangle *triangle;
4270 DPSOFTRAST_State_Span *span;
// one mask byte per span column; handed to the Span callback through span->pixelmask
4271 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4272 for (i = 0; i < thread->numspans; i++)
4274 span = &thread->spans[i];
4275 triangle = &thread->triangles[span->triangle];
// depth-tested path: requires the depthtest flag and a non-null depth buffer
4276 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w is a plane: [0] = change per x, [1] = change per y, [2] = value at the origin
4278 wslope = triangle->w[0];
4279 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth at the span origin and its per-column step; the polygon offset
// terms are subtracted from the starting value only
4280 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4281 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// points at the span origin in the depth buffer; indexed below with the span-relative column x
4282 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4283 startx = span->startx;
// fill the pass mask; each comparison is written as (stored depth) OP (new depth d)
4285 switch(thread->fb_depthfunc)
4288 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4289 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4290 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4291 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4292 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4293 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4294 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4296 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4297 //for (x = startx;x < endx;x++)
4298 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4299 // if there is no color buffer, skip pixel shader
// these two loops scan inward from each end of the span while the mask byte is clear
4300 while (startx < endx && !pixelmask[startx])
4302 while (endx > startx && !pixelmask[endx-1])
4305 continue; // no pixels to fill
4306 span->pixelmask = pixelmask;
4307 span->startx = startx;
4309 // run pixel shader if appropriate
4310 // do this before running depthmask code, to allow the pixelshader
4311 // to clear pixelmask values for alpha testing
4312 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4313 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth write pass over the same columns, restarting d from the trimmed startx
4314 if (thread->depthmask)
4315 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4321 // no depth testing means we're just dealing with color...
4322 // if there is no color buffer, skip pixel shader
4323 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// mark every column in [span->startx, span->endx) as passing
4325 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4326 span->pixelmask = pixelmask;
4327 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue is empty again
4331 thread->numspans = 0;
// Declares the Draw command through the DEFCOMMAND macro (defined outside this chunk)
// with first argument 22 and the payload fields listed here. Code below refers to
// the result as DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw.
// refcount is an ATOMIC_COUNTER that DPSOFTRAST_Interpret_Draw decrements with ATOMIC_DECREMENT;
// arrays points at the per-draw vertex data, element3i / element3s at the index lists.
4334 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Worker-side handler for a Draw command. For each triangle it clips against the
// near plane, bounds the result against the scissor rectangle and this thread's two
// row ranges, computes interpolation planes and mip levels, and queues spans that
// DPSOFTRAST_Draw_ProcessSpans shades.
// thread:  per-thread rasterizer state (scissor, row ranges, span/triangle queues)
// command: the Draw command; its refcount is decremented on both visible exit paths
// NOTE: this listing omits some short lines (braces, brief statements, switch headers,
// guarding conditions); the comments describe the visible lines only.
4336 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4339 int cullface = thread->cullface;
4340 int minx, maxx, miny, maxy;
4341 int miny1, maxy1, miny2, maxy2;
4342 __m128i fbmin, fbmax;
4343 __m128 viewportcenter, viewportscale;
4344 int firstvertex = command->firstvertex;
4345 int numvertices = command->numvertices;
4346 int numtriangles = command->numtriangles;
4347 const int *element3i = command->element3i;
4348 const unsigned short *element3s = command->element3s;
4349 int clipped = command->clipped;
4356 int starty, endy, bandy;
4360 __m128 triangleedge1, triangleedge2, trianglenormal;
4363 DPSOFTRAST_State_Triangle *triangle;
4364 DPSOFTRAST_Texture *texture;
4365 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, and the thread's two row ranges clamped into it
4366 miny = thread->fb_scissor[1];
4367 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4368 miny1 = bound(miny, thread->miny1, maxy);
4369 maxy1 = bound(miny, thread->maxy1, maxy);
4370 miny2 = bound(miny, thread->miny2, maxy);
4371 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's rows [starty, endy) overlap neither range: only release the reference
4372 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4374 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare header had its data allocated separately
// (see DPSOFTRAST_Draw_AllocateDrawCommand), so the last user frees it
4376 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4377 MM_FREE(command->arrays);
4381 minx = thread->fb_scissor[0];
4382 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp bounds as repeated 16-bit (x, y) pairs; fbmax is made inclusive by subtracting 1
4383 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4384 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4385 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4386 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4387 screen[3] = _mm_setzero_ps();
4388 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4389 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space float4s, followed by the other
// per-vertex float4 arrays; `arrays` is re-based for every triangle
4391 const float *screencoord4f = command->arrays;
4392 const float *arrays = screencoord4f + numvertices*4;
4394 // generate the 3 edges of this triangle
4395 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices relative to firstvertex, read from the 16-bit or the 32-bit index list
// (the condition choosing between them is on a line not shown here)
4398 e[0] = element3s[i*3+0] - firstvertex;
4399 e[1] = element3s[i*3+1] - firstvertex;
4400 e[2] = element3s[i*3+2] - firstvertex;
4404 e[0] = element3i[i*3+0] - firstvertex;
4405 e[1] = element3i[i*3+1] - firstvertex;
4406 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros for the clip cases below (several continuation lines are not shown):
//  SKIPBACKFACE      - forms two edge vectors from screen[0..2] and a scalar cross-product term, then tests its sign
//  CLIPPEDVERTEXLERP - stores the clip fraction for edge p1->p2 in clipfrac[p1] and projects the interpolated vertex into screen[k]
//  CLIPPEDVERTEXCOPY - loads the already projected vertex p1 into screen[k]
4415 #define SKIPBACKFACE \
4416 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4417 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4418 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4419 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4420 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4424 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4428 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4433 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4434 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4436 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4437 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4439 #define CLIPPEDVERTEXCOPY(k,p1) \
4440 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS do the same per attribute array, reusing
// clipfrac[] for interpolated corners; GENATTRIBS lists one case per clip configuration
4442 #define GENATTRIBCOPY(attrib, p1) \
4443 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4444 #define GENATTRIBLERP(attrib, p1, p2) \
4446 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4447 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4449 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4453 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4454 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4455 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4456 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4457 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4458 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4459 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4465 // calculate distance from nearplane (third plus fourth component of each vertex; a negative value is treated as behind the plane)
4466 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4467 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4468 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the sign pattern of the three distances picks one of the clip cases; cases that
// cut off exactly one corner produce a fourth point in screen[3]
4469 if (clipdist[0] >= 0.0f)
4471 if (clipdist[1] >= 0.0f)
4473 if (clipdist[2] >= 0.0f)
4476 // triangle is entirely in front of nearplane
4477 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4484 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4492 if (clipdist[2] >= 0.0f)
4494 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4501 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4508 else if (clipdist[1] >= 0.0f)
4510 if (clipdist[2] >= 0.0f)
4512 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4519 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4525 else if (clipdist[2] >= 0.0f)
4527 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4532 else continue; // triangle is entirely behind nearplane
4535 // calculate integer y coords for triangle points
// screeni packs truncated (x, y) of up to four points as 16-bit values; the following
// min/max steps reduce them to a bounding box, which is then clamped to fbmin/fbmax
4536 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4537 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4538 screenmin = _mm_min_epi16(screeni, screenir),
4539 screenmax = _mm_max_epi16(screeni, screenir);
4540 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4541 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4542 screenmin = _mm_max_epi16(screenmin, fbmin);
4543 screenmax = _mm_min_epi16(screenmax, fbmax);
4544 // skip offscreen triangles
4545 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// rows covered by the clamped box: starty inclusive, endy exclusive
4547 starty = _mm_extract_epi16(screenmin, 1);
4548 endy = _mm_extract_epi16(screenmax, 1)+1;
// the box lies entirely in the gap between the first and the second row range
4549 if (starty >= maxy1 && endy <= miny2)
// keep the y half of each packed (x, y) pair as a sign-extended 32-bit lane
4551 screeny = _mm_srai_epi32(screeni, 16);
4554 triangle = &thread->triangles[thread->numtriangles];
4556 // calculate attribute planes for triangle data...
4557 // okay, this triangle is going to produce spans, we'd better project
4558 // the interpolants now (this is what gives perspective texturing),
4559 // this consists of simply multiplying all arrays by the W coord
4560 // (which is basically 1/Z), which will be undone per-pixel
4561 // (multiplying by Z again) to get the perspective-correct array
4564 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4565 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar normal term; each lane is then broadcast
// into one of the four slope factors used for every plane below
4566 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4567 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4568 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4569 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4570 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4571 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4572 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4573 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4574 attribedge1 = _mm_sub_ss(w0, w1);
4575 attribedge2 = _mm_sub_ss(w2, w1);
4576 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4577 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4578 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4579 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4580 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// w plane: [0] = change per x, [1] = change per y, [2] = value at the screen origin;
// DPSOFTRAST_Draw_ProcessSpans reads these back for depth
4581 _mm_store_ss(&triangle->w[0], attribxslope);
4582 _mm_store_ss(&triangle->w[1], attribyslope);
4583 _mm_store_ss(&triangle->w[2], attriborigin);
4584 mipedgescale = _mm_setzero_ps();
// one plane per vertex array listed for the current shader mode
4585 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4587 __m128 attrib0, attrib1, attrib2;
4588 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4589 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-vertex array block before reading the three corner values
4591 arrays += numvertices*4;
4592 GENATTRIBS(attrib0, attrib1, attrib2);
// corners are multiplied by their w before the plane is fitted (see the comment block above)
4593 attriborigin = _mm_mul_ps(attrib1, w1);
4594 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4595 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4596 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4597 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4598 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// same [0]/[1]/[2] plane layout as triangle->w, written with non-temporal stores
4599 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4600 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4601 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array named by lodarrayindex, keep the attribute deltas along both edges
// scaled by the reciprocal edge length (rsqrt of the squared length) for mip selection
4602 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4604 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4605 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4606 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4607 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// reset all mip levels, then choose one per texture unit listed for this shader mode
4611 memset(triangle->mip, 0, sizeof(triangle->mip));
4612 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4614 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4615 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4617 texture = thread->texbound[texunit];
// only textures whose filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed level
4618 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scales the edge deltas by two ints loaded from texture->mipmap[0][2]
// (presumably the base level's width and height - TODO confirm)
4620 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4621 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4622 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4623 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4624 // this will be multiplied in the texturing routine by the texture resolution
4625 y = _mm_cvtss_si32(mipdensity);
// half of the base-2 logarithm of the density value, clamped to the last mip level
4628 y = (int)(log((float)y)*0.5f/M_LN2);
4629 if (y > texture->mipmaps - 1)
4630 y = texture->mipmaps - 1;
4631 triangle->mip[texunit] = y;
// row walk: the first pass is limited to the first row range (bandy = min(endy, maxy1)),
// then y jumps to at least miny2 and bandy becomes min(endy, maxy2)
4637 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4640 __m128 xcoords, xslope;
// each 32-bit lane of ycc is all ones where y is greater than that point's integer y;
// _mm_movemask_epi8 turns every lane into four mask bits
4641 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4642 int yccmask = _mm_movemask_epi8(ycc);
4643 int edge0p, edge0n, edge1p, edge1n;
// case table for four points (its switch header is not shown): selects the two point
// indices (p, n) of each of the two edges that bound the current rows
4650 case 0xFFFF: /*0000*/ y = endy; continue;
4651 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4652 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4653 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4654 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4655 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4656 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4657 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4658 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4659 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4660 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4661 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4662 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4663 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4664 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4665 case 0x0000: /*1111*/ y++; continue;
// case table for three points (its switch header is not shown)
4673 case 0xFFFF: /*000*/ y = endy; continue;
4674 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4675 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4676 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4677 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4678 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4679 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4680 case 0x0000: /*111*/ y++; continue;
// nexty: all-ones lanes (points already passed) become 0x7FFF per 16-bit element so the
// min-reduction over the lanes picks from the remaining point rows; the result is
// then limited to the last row of the current range
4683 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4684 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4685 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4686 nexty = _mm_extract_epi16(ycc, 0);
4687 if (nexty >= bandy) nexty = bandy-1;
// lanes 0-1 describe edge 0, lanes 2-3 edge 1: (n - p) is divided by its own y delta,
// leaving dx/dy in lanes 0 and 2; xcoords is each edge's x at row y, plus 0.5
4688 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4689 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4690 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4691 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4692 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two halves when edge 0 starts to the right of edge 1
4693 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4695 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4696 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// rows y..nexty: both edge x values advance by their slopes after every row
4698 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4700 int startx, endx, offset;
4701 startx = _mm_cvtss_si32(xcoords);
4702 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// adds (minx - startx) rounded down to a multiple of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (assumes that constant is a power of two; the guarding condition is on a line not shown)
4705 if (startx < 0) startx = 0;
4706 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4708 if (endx > maxx) endx = maxx;
4709 if (startx >= endx) continue;
// cut the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH columns;
// span->startx / span->endx are relative to the chunk start `offset`
4710 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4712 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4713 span->triangle = thread->numtriangles;
4716 span->startx = max(minx - offset, 0);
4717 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4718 if (span->startx >= span->endx)
// shade immediately once the span queue is full
4720 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4721 DPSOFTRAST_Draw_ProcessSpans(thread);
// same for the triangle queue: flush the pending spans, then reuse the triangle slots
4726 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4728 DPSOFTRAST_Draw_ProcessSpans(thread);
4729 thread->numtriangles = 0;
// release this thread's reference; the last user frees separately allocated vertex data
4733 if (!ATOMIC_DECREMENT(command->refcount))
4735 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4736 MM_FREE(command->arrays);
// shade whatever is still queued before returning
4739 if (thread->numspans > 0 || thread->numtriangles > 0)
4741 DPSOFTRAST_Draw_ProcessSpans(thread);
4742 thread->numtriangles = 0;
// Reserves a Draw command together with the buffer that holds its per-vertex
// float[4] arrays and a private copy of the element indices, and points
// dpsoftrast.screencoord4f / dpsoftrast.post_array4f[] into that buffer.
// firstvertex, numvertices and numtriangles are recorded in the command;
// element3s / element3i are the index arrays that get copied.
// NOTE(review): this listing omits short lines (braces, else, break, return),
// so branch structure is described as "presumably" wherever it is not visible.
4747 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4751 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays are always needed: screencoord4f and the POSITION post array assigned further down
4752 int datasize = 2*numvertices*sizeof(float[4]);
4753 DPSOFTRAST_Command_Draw *command;
4754 unsigned char *data;
// sizing pass: one more float[4] array per entry in the current shader mode's array list
4755 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4757 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// presumably an out-of-range entry terminates the list (the statement under this test is not visible)
4758 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4760 datasize += numvertices*sizeof(float[4]);
// index storage: presumably exactly one of these two is added, depending on which element array was supplied
4763 datasize += numtriangles*sizeof(unsigned short[3]);
4765 datasize += numtriangles*sizeof(int[3]);
4766 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: only the command header is allocated as a command and the data comes from MM_CALLOC;
// code elsewhere in this file releases command->arrays with MM_FREE when commandsize is just the aligned header size
// NOTE(review): no NULL check on the MM_CALLOC result is visible before data is written
4767 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4769 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4770 data = (unsigned char *)MM_CALLOC(datasize, 1);
// presumably the else branch: the data sits directly behind the command header
4774 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4775 data = (unsigned char *)command + commandsize;
4777 command->firstvertex = firstvertex;
4778 command->numvertices = numvertices;
4779 command->numtriangles = numtriangles;
4780 command->arrays = (float *)data;
// carve the buffer up in the same order it was sized above; array slots not assigned below stay NULL
4781 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4782 dpsoftrast.firstvertex = firstvertex;
4783 dpsoftrast.numvertices = numvertices;
4784 dpsoftrast.screencoord4f = (float *)data;
4785 data += numvertices*sizeof(float[4]);
4786 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4787 data += numvertices*sizeof(float[4]);
4788 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4790 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4791 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4793 dpsoftrast.post_array4f[j] = (float *)data;
4794 data += numvertices*sizeof(float[4]);
// the indices are copied into the tail of the buffer; both pointers start out NULL
4796 command->element3i = NULL;
4797 command->element3s = NULL;
4800 command->element3s = (unsigned short *)data;
4801 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4805 command->element3i = (int *)data;
4806 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex() function, then hands the command to the worker threads or
// flushes it, depending on dpsoftrast.usethreads.
// NOTE(review): short lines (braces, else, return, the declaration of i) are
// omitted from this listing; the structure noted as "presumably" is inferred.
4811 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4813 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4814 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// the row range is taken from dpsoftrast.drawstarty/drawendy, bounded against 0 and the framebuffer height
4815 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4816 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated arrays and take the command back out of the pool
// (presumably followed by an early return that is not visible here)
4817 if (command->starty >= command->endy)
4819 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4820 MM_FREE(command->arrays);
4821 DPSOFTRAST_UndoCommand(command->commandsize);
4824 command->clipped = dpsoftrast.drawclipped;
// the reference count starts at the number of threads
4825 command->refcount = dpsoftrast.numthreads;
4827 if (dpsoftrast.usethreads)
4830 DPSOFTRAST_Draw_SyncCommands();
// only threads marked starving whose row bands overlap this draw are signalled
4831 for (i = 0; i < dpsoftrast.numthreads; i++)
4833 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4834 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4835 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded else branch
4840 DPSOFTRAST_Draw_FlushThreads();
// Runs queued commands for one thread, starting at thread->commandoffset and
// continuing until the offset equals endoffset, then stores the new offset
// back into thread->commandoffset.
// NOTE(review): short lines (braces, break, the Reset case body) are omitted
// from this listing.
4844 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4846 int commandoffset = thread->commandoffset;
4847 while (commandoffset != endoffset)
4849 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4850 switch (command->opcode)
// INTERPCOMMAND(name) generates the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of its command
// struct, and wrap the offset to 0 once it reaches the end of the pool.
4852 #define INTERPCOMMAND(name) \
4853 case DPSOFTRAST_OPCODE_##name : \
4854 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4855 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4856 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4857 commandoffset = 0; \
4859 INTERPCOMMAND(Viewport)
4860 INTERPCOMMAND(ClearColor)
4861 INTERPCOMMAND(ClearDepth)
4862 INTERPCOMMAND(ColorMask)
4863 INTERPCOMMAND(DepthTest)
4864 INTERPCOMMAND(ScissorTest)
4865 INTERPCOMMAND(Scissor)
4866 INTERPCOMMAND(BlendFunc)
4867 INTERPCOMMAND(BlendSubtract)
4868 INTERPCOMMAND(DepthMask)
4869 INTERPCOMMAND(DepthFunc)
4870 INTERPCOMMAND(DepthRange)
4871 INTERPCOMMAND(PolygonOffset)
4872 INTERPCOMMAND(CullFace)
4873 INTERPCOMMAND(AlphaTest)
4874 INTERPCOMMAND(AlphaFunc)
4875 INTERPCOMMAND(SetTexture)
4876 INTERPCOMMAND(SetShader)
4877 INTERPCOMMAND(Uniform4f)
4878 INTERPCOMMAND(UniformMatrix4f)
4879 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the advance uses the size stored in the
// command; the thread's offset is also written back right after each draw
4881 case DPSOFTRAST_OPCODE_Draw:
4882 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4883 commandoffset += command->commandsize;
4884 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4886 thread->commandoffset = commandoffset;
// NOTE(review): the body of the Reset case is not visible in this listing
4889 case DPSOFTRAST_OPCODE_Reset:
4894 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Keeps looping for as long as thread->index is non-negative.
// NOTE(review): short lines (braces, else, return) are omitted from this listing.
4897 static int DPSOFTRAST_Draw_Thread(void *data)
4899 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4900 while(thread->index >= 0)
// behind the published command offset: interpret commands up to it
4902 if (thread->commandoffset != dpsoftrast.drawcommand)
4904 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// idle path (the gap in the embedded line numbers suggests this is the else branch of the test above; confirm)
4908 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before waiting
4909 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// signal waitcond if the waiting flag is set, then flag this thread as starving while it waits on drawcond
4911 if (thread->waiting) Thread_CondSignal(thread->waitcond);
4912 thread->starving = true;
4913 Thread_CondWait(thread->drawcond, thread->drawmutex);
4914 thread->starving = false;
4916 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's commandoffset up to dpsoftrast.drawcommand and then
// resets the command pool's usedcommands counter to 0.
// NOTE(review): short lines (braces, else, the declaration of i) are omitted
// from this listing; structure noted as "presumably" is inferred.
4922 static void DPSOFTRAST_Draw_FlushThreads(void)
4924 DPSOFTRAST_State_Thread *thread;
4926 DPSOFTRAST_Draw_SyncCommands();
4927 if (dpsoftrast.usethreads)
// pass 1: signal any thread that is both behind and marked starving (flag re-checked under its mutex)
4929 for (i = 0; i < dpsoftrast.numthreads; i++)
4931 thread = &dpsoftrast.threads[i];
4932 if (thread->commandoffset != dpsoftrast.drawcommand)
4934 Thread_LockMutex(thread->drawmutex);
4935 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4936 Thread_CondSignal(thread->drawcond);
4937 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread still behind, set waiting and wait on its waitcond
// NOTE(review): only a single wait under an if is visible; whether the enclosing code re-checks after wake-up cannot be seen here
4940 for (i = 0; i < dpsoftrast.numthreads; i++)
4942 thread = &dpsoftrast.threads[i];
4943 if (thread->commandoffset != dpsoftrast.drawcommand)
4945 Thread_LockMutex(thread->drawmutex);
4946 if (thread->commandoffset != dpsoftrast.drawcommand)
4948 thread->waiting = true;
4949 Thread_CondWait(thread->waitcond, thread->drawmutex);
4950 thread->waiting = false;
4952 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded else branch: the caller interprets each thread state's commands itself
4958 for (i = 0; i < dpsoftrast.numthreads; i++)
4960 thread = &dpsoftrast.threads[i];
4961 if (thread->commandoffset != dpsoftrast.drawcommand)
4962 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4965 dpsoftrast.commandpool.usedcommands = 0;
// Public wrapper: forwards to DPSOFTRAST_Draw_FlushThreads().
4968 void DPSOFTRAST_Flush(void)
4970 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments.
// NOTE(review): the body is not visible in this listing; presumably it completes outstanding work the way DPSOFTRAST_Flush does (confirm).
4973 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state for a width x height framebuffer
// backed by the caller-supplied colorpixels / depthpixels buffers, sets up
// default render state for every thread slot and, when threading is enabled,
// creates the worker threads.
// numthreads: requested worker count; interlace: selects the two-band layout below.
// NOTE(review): short lines (braces, else, local declarations, the return
// statement) are omitted from this listing, so the return value is not visible.
4978 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero state; everything not assigned below stays 0/NULL
4988 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): the declaration and setup of u are not visible; presumably a byte-order probe (confirm)
4989 dpsoftrast.bigendian = u.b[3];
4990 dpsoftrast.fb_width = width;
4991 dpsoftrast.fb_height = height;
4992 dpsoftrast.fb_depthpixels = depthpixels;
4993 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index 1 is assigned three times; presumably indices 2 and 3 were intended.
// No effect on behaviour as long as the memset above has already zeroed the array.
4994 dpsoftrast.fb_colorpixels[1] = NULL;
4995 dpsoftrast.fb_colorpixels[1] = NULL;
4996 dpsoftrast.fb_colorpixels[1] = NULL;
// the default viewport covers the whole framebuffer
4997 dpsoftrast.viewport[0] = 0;
4998 dpsoftrast.viewport[1] = 0;
4999 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5000 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5001 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5002 dpsoftrast.texture_firstfree = 1;
5003 dpsoftrast.texture_end = 1;
5004 dpsoftrast.texture_max = 0;
5005 dpsoftrast.color[0] = 1;
5006 dpsoftrast.color[1] = 1;
5007 dpsoftrast.color[2] = 1;
5008 dpsoftrast.color[3] = 1;
// threads are used only when requested and Thread_HasThreads() reports support;
// without threads a single state slot is still allocated and interlace is forced to 0
5009 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5010 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5011 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): no NULL check on this allocation is visible before the loop writes through it
5012 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5013 for (i = 0; i < dpsoftrast.numthreads; i++)
5015 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread slot
5017 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines, so it keeps the zero from MM_CALLOC; confirm this is intended
5018 thread->colormask[1] = 1;
5019 thread->colormask[2] = 1;
5020 thread->colormask[3] = 1;
5021 thread->blendfunc[0] = GL_ONE;
5022 thread->blendfunc[1] = GL_ZERO;
5023 thread->depthmask = true;
5024 thread->depthtest = true;
5025 thread->depthfunc = GL_LEQUAL;
5026 thread->scissortest = false;
5027 thread->alphatest = false;
5028 thread->alphafunc = GL_GREATER;
5029 thread->alphavalue = 0.5f;
5030 thread->viewport[0] = 0;
5031 thread->viewport[1] = 0;
5032 thread->viewport[2] = dpsoftrast.fb_width;
5033 thread->viewport[3] = dpsoftrast.fb_height;
5034 thread->scissor[0] = 0;
5035 thread->scissor[1] = 0;
5036 thread->scissor[2] = dpsoftrast.fb_width;
5037 thread->scissor[3] = dpsoftrast.fb_height;
5038 thread->depthrange[0] = 0;
5039 thread->depthrange[1] = 1;
5040 thread->polygonoffset[0] = 0;
5041 thread->polygonoffset[1] = 0;
// interlaced layout: each thread owns two row bands, slice i of 2*numthreads and slice numthreads+i
5043 if (dpsoftrast.interlace)
5045 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5046 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5047 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5048 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// presumably the else branch: one band per thread, with both band slots set to the same range
5052 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5053 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5056 thread->numspans = 0;
5057 thread->numtriangles = 0;
5058 thread->commandoffset = 0;
5059 thread->waiting = false;
5060 thread->starving = false;
5062 thread->validate = -1;
5063 DPSOFTRAST_Validate(thread, -1);
// create the synchronization objects before the worker starts, since DPSOFTRAST_Draw_Thread uses them
5065 if (dpsoftrast.usethreads)
5067 thread->waitcond = Thread_CreateCond();
5068 thread->drawcond = Thread_CreateCond();
5069 thread->drawmutex = Thread_CreateMutex();
5070 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5076 void DPSOFTRAST_Shutdown(void)
5079 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5081 DPSOFTRAST_State_Thread *thread;
5082 for (i = 0; i < dpsoftrast.numthreads; i++)
5084 thread = &dpsoftrast.threads[i];
5085 Thread_LockMutex(thread->drawmutex);
5087 Thread_CondSignal(thread->drawcond);
5088 Thread_UnlockMutex(thread->drawmutex);
5089 Thread_WaitThread(thread->thread, 0);
5090 Thread_DestroyCond(thread->waitcond);
5091 Thread_DestroyCond(thread->drawcond);
5092 Thread_DestroyMutex(thread->drawmutex);
5095 for (i = 0;i < dpsoftrast.texture_end;i++)
5096 if (dpsoftrast.texture[i].bytes)
5097 MM_FREE(dpsoftrast.texture[i].bytes);
5098 if (dpsoftrast.texture)
5099 free(dpsoftrast.texture);
5100 if (dpsoftrast.threads)
5101 MM_FREE(dpsoftrast.threads);
5102 memset(&dpsoftrast, 0, sizeof(dpsoftrast));