// Build configuration for the software rasterizer: alignment helpers, atomic
// counter operations and a memory barrier, in compiler-specific variants
// followed by plain fallbacks.
3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// alignment in bytes passed to _mm_malloc by MM_MALLOC / MM_CALLOC
14 #define ATOMIC_SIZE 32
// variant built on __attribute__ and the __sync_* builtins
18 #define ALIGN(var) var __attribute__((__aligned__(16)))
19 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// store fence only; a full barrier is kept below as a commented-out alternative
20 #define MEMORY_BARRIER (_mm_sfence())
21 //(__sync_synchronize())
22 #define ATOMIC_COUNTER volatile int
// increment/decrement evaluate to the updated value; ATOMIC_ADD discards its result
23 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
24 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
25 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// variant for _MSC_VER built on __declspec and the Interlocked* functions
26 #elif defined(_MSC_VER)
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
31 #define ATOMIC_COUNTER volatile LONG
32 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
33 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
34 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// fallbacks: no alignment request, no barrier, plain (non-atomic) integer operations
39 #define ALIGN(var) var
42 #define ATOMIC(var) var
44 #ifndef MEMORY_BARRIER
45 #define MEMORY_BARRIER ((void)0)
47 #ifndef ATOMIC_COUNTER
48 #define ATOMIC_COUNTER int
50 #ifndef ATOMIC_INCREMENT
51 #define ATOMIC_INCREMENT(counter) (++(counter))
53 #ifndef ATOMIC_DECREMENT
54 #define ATOMIC_DECREMENT(counter) (--(counter))
57 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// SSE2 intrinsics header; MM_MALLOC allocates with ATOMIC_SIZE-byte alignment
61 #include <emmintrin.h>
63 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
65 static void *MM_CALLOC(size_t nmemb, size_t size)
67 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
68 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// release function matching the _mm_malloc-based allocators
72 #define MM_FREE _mm_free
// allocator variants that map straight onto libc malloc/calloc (no alignment argument)
74 #define MM_MALLOC(size) malloc(size)
75 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// sizes DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
79 typedef enum DPSOFTRAST_ARRAY_e
81 DPSOFTRAST_ARRAY_POSITION,
82 DPSOFTRAST_ARRAY_COLOR,
83 DPSOFTRAST_ARRAY_TEXCOORD0,
84 DPSOFTRAST_ARRAY_TEXCOORD1,
85 DPSOFTRAST_ARRAY_TEXCOORD2,
86 DPSOFTRAST_ARRAY_TEXCOORD3,
87 DPSOFTRAST_ARRAY_TEXCOORD4,
88 DPSOFTRAST_ARRAY_TEXCOORD5,
89 DPSOFTRAST_ARRAY_TEXCOORD6,
90 DPSOFTRAST_ARRAY_TEXCOORD7,
91 DPSOFTRAST_ARRAY_TOTAL
// One slot of dpsoftrast.texture[]. A slot whose `bytes` is NULL is treated as
// unused by the lookup, allocation and free routines.
95 typedef struct DPSOFTRAST_Texture_s
// filter mode set through DPSOFTRAST_Texture_Filter
102 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts how many times the texture is currently bound — confirm against the bind code
105 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated by DPSOFTRAST_Texture_New
106 unsigned char *bytes;
// per mip level: [0] byte offset into `bytes`, [1] size in bytes, [2] width, [3] height, [4] depth
107 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are rounded up to a multiple of COMMAND_SIZE bytes by
// DPSOFTRAST_ALIGNCOMMAND and declared with the ALIGN() alignment request.
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header of every command record: the opcode and the record size.
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
116 unsigned char opcode;
117 unsigned short commandsize;
// opcode 0 is written by DPSOFTRAST_AllocateCommand at the current position when
// a new command does not fit in the remaining tail of the pool
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and a
// DPSOFTRAST_Command_<name> struct that begins with the same
// opcode/commandsize members as DPSOFTRAST_Command.
123 #define DEFCOMMAND(opcodeval, name, fields) \
124 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
127 unsigned char opcode; \
128 unsigned short commandsize; \
130 } DPSOFTRAST_Command_##name );
// size in bytes of the command buffer
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for one command record — confirm where it is enforced
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer; records are appended by DPSOFTRAST_AllocateCommand, which
// wraps back to the start when a record does not fit in the tail.
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
139 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
141 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed when shading spans.
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
145 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three 4-float rows used by DPSOFTRAST_CALCATTRIB:
// [0] multiplied by span x, [1] multiplied by span y, [2] constant term
147 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
149 DPSOFTRAST_State_Triangle);
// SSE evaluation of attribute `arrayindex` at a span's (x, y):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// `slope` and `data` are assigned as __m128 values. _mm_load_ps needs 16-byte
// aligned rows, which the ALIGN() on `attribs` requests.
151 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
152 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
153 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
154 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
155 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: evaluates attribute
// `arrayindex` at a span's (x, y) into two 4-float arrays:
//   slope[i] = attribs[arrayindex][0][i]
//   data[i]  = attribs[arrayindex][2][i] + span->x * slope[i] + span->y * attribs[arrayindex][1][i]
// Every pointer/array argument is parenthesized so expressions such as
// `spans + n` can be passed; arguments are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle; x and y are the
// values fed to DPSOFTRAST_CALCATTRIB when interpolating attributes.
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
172 int triangle; // triangle this span was generated by
173 int x; // framebuffer x coord
174 int y; // framebuffer y coord
175 int startx; // usable range (according to pixelmask)
176 int endx; // usable range (according to pixelmask)
177 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
179 DPSOFTRAST_State_Span);
// capacities of the per-thread spans[] and triangles[] arrays
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// dirty bits kept in DPSOFTRAST_State_Thread::validate; each bit selects one
// Recalc* routine in DPSOFTRAST_Validate
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three bits together
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the GL
// (source factor, destination factor) pair; the pairs are listed per member.
189 typedef enum DPSOFTRAST_BLENDMODE_e
// (GL_ONE, GL_ZERO); also the fallback for unrecognized pairs
191 DPSOFTRAST_BLENDMODE_OPAQUE,
// (GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)
192 DPSOFTRAST_BLENDMODE_ALPHA,
// (GL_SRC_ALPHA, GL_ONE)
193 DPSOFTRAST_BLENDMODE_ADDALPHA,
// (GL_ONE, GL_ONE)
194 DPSOFTRAST_BLENDMODE_ADD,
// (GL_ZERO, GL_ONE_MINUS_SRC_COLOR)
195 DPSOFTRAST_BLENDMODE_INVMOD,
// (GL_ZERO, GL_SRC_COLOR) or (GL_DST_COLOR, GL_ZERO)
196 DPSOFTRAST_BLENDMODE_MUL,
// (GL_DST_COLOR, GL_SRC_COLOR)
197 DPSOFTRAST_BLENDMODE_MUL2,
// (GL_SRC_ALPHA, GL_ONE) while blendsubtract is enabled
198 DPSOFTRAST_BLENDMODE_SUBALPHA,
// (GL_ONE, GL_ONE_MINUS_SRC_ALPHA)
199 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
// (GL_ONE_MINUS_DST_COLOR, GL_ONE)
200 DPSOFTRAST_BLENDMODE_INVADD,
// number of blend modes
201 DPSOFTRAST_BLENDMODE_TOTAL
203 DPSOFTRAST_BLENDMODE;
// Per-thread render state. The DPSOFTRAST_Interpret_* handlers write the
// state members; the Recalc* routines fill in the derived values.
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
224 float polygonoffset[2];
227 int shader_permutation;
// rebased by DPSOFTRAST_Texture_Grow when the texture array is reallocated
229 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
231 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
234 // DPSOFTRAST_VALIDATE_ flags
237 // derived values (DPSOFTRAST_VALIDATE_FB)
// both arrays are written by DPSOFTRAST_RecalcViewport
240 ALIGN(float fb_viewportcenter[4]);
241 ALIGN(float fb_viewportscale[4]);
243 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
246 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; DPSOFTRAST_Draw_FreeCommandPool
// compares it with the allocation position to find how far the thread lags
255 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on this thread's waitcond
257 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond before waiting;
// NOTE(review): presumably means the thread is idle waiting for commands — confirm
258 volatile bool starving;
265 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 DPSOFTRAST_State_Thread);
// Global rasterizer state (render targets, vertex array pointers, texture
// table, worker threads and the command pool), followed by the single global
// instance `dpsoftrast`.
270 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets
274 unsigned int *fb_depthpixels;
275 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
278 ALIGN(float fb_viewportcenter[4]);
279 ALIGN(float fb_viewportscale[4]);
282 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
285 const float *pointer_vertex3f;
286 const float *pointer_color4f;
287 const unsigned char *pointer_color4ub;
288 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
291 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// rebased by DPSOFTRAST_Texture_Grow when the texture array is reallocated
293 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
297 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298 float *screencoord4f;
304 int shader_permutation;
// index where DPSOFTRAST_Texture_New starts its search for an unused slot
308 int texture_firstfree;
// texture slot array, grown by DPSOFTRAST_Texture_Grow
309 DPSOFTRAST_Texture *texture;
// message describing the most recent failure, set by the texture functions
314 const char *errorstring;
319 DPSOFTRAST_State_Thread *threads;
// allocation position most recently published by DPSOFTRAST_Draw_SyncCommands
321 ATOMIC(volatile int drawcommand);
323 DPSOFTRAST_State_Command_Pool commandpool;
327 DPSOFTRAST_State dpsoftrast;
// depth buffer scale: 1024 * 1048576 = 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit BGRA8 value (alpha in bits 24-31,
// red 16-23, green 8-15, blue 0-7), rounding each channel to nearest.
// Arguments are parenthesized so expressions may be passed, and the shifts are
// done on unsigned values so that alpha >= 0.5 does not shift into the sign
// bit of an int. The result is still an int, as before.
// NOTE(review): channels are assumed to lie in [0,1]; values outside that range are not clamped.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) \
	((int)(((unsigned int)(int)((a) * 255.0f + 0.5f) << 24) | \
	       ((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	       ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	        (unsigned int)(int)((b) * 255.0f + 0.5f)))
// Converts a float depth to the integer depth buffer format: d = 1 maps to 0
// and d = 0 maps to DPSOFTRAST_DEPTHSCALE.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
337 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339 fb_viewportcenter[3] = 0.5f;
340 fb_viewportcenter[0] = 0.0f;
341 fb_viewportscale[1] = 0.5f * viewport[2];
342 fb_viewportscale[2] = -0.5f * viewport[3];
343 fb_viewportscale[3] = 0.5f;
344 fb_viewportscale[0] = 1.0f;
// Recomputes a thread's framebuffer-derived state: the effective scissor
// rectangle (stored in fb_scissor as x, y, width, height) and the viewport
// center/scale.
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
349 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350 // and viewport projection values
353 x1 = thread->scissor[0];
354 x2 = thread->scissor[0] + thread->scissor[2];
// the scissor's y range is mirrored against the framebuffer height
355 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356 y2 = dpsoftrast.fb_height - thread->scissor[1];
// scissor test disabled: use the whole framebuffer
357 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
359 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
361 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362 thread->fb_scissor[0] = x1;
363 thread->fb_scissor[1] = y1;
364 thread->fb_scissor[2] = x2 - x1;
365 thread->fb_scissor[3] = y2 - y1;
367 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
372 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from blendsubtract and the blendfunc
// (source, destination) factor pair. The switch key is
// (source << 16) | destination; pairs without a matching case select
// DPSOFTRAST_BLENDMODE_OPAQUE.
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) has a dedicated mode
377 if (thread->blendsubtract)
379 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that stores the blend mode and breaks
381 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
389 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two different factor pairs both select MUL
396 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the dirty
// bits in `f` is set in thread->validate; the expression evaluates to 0 either
// way. `thread` is parenthesized so pointer expressions may be passed; both
// arguments can be evaluated twice.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Brings the derived state selected by `mask` up to date: for every requested
// bit that is set in thread->validate, the bit is cleared and the matching
// Recalc* routine runs.
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only groups that are both requested and dirty need work
410 mask &= thread->validate;
413 if (mask & DPSOFTRAST_VALIDATE_FB)
415 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416 DPSOFTRAST_RecalcFB(thread);
418 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
420 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421 DPSOFTRAST_RecalcDepthFunc(thread);
423 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
425 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. The slot is returned only when index lies
// in [1, texture_end) and the slot has pixel storage; callers in this file
// treat a false (null) result as "no such texture".
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
432 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433 return &dpsoftrast.texture[index];
// Enlarges the texture slot array and rebases every bound-texture pointer
// (global and per thread) from the old array onto the new one.
437 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound pointers can be converted back to indices below
439 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440 DPSOFTRAST_State_Thread *thread;
444 // expand texture array as needed
445 if (dpsoftrast.texture_max < 1024)
446 dpsoftrast.texture_max = 1024;
448 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture directly; on
// failure the old array would be lost and the rebasing below would compute
// pointers from NULL — consider a temporary pointer and an error path.
449 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase: new base + (old pointer - old base)
450 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451 if (dpsoftrast.texbound[i])
452 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453 for (j = 0; j < dpsoftrast.numthreads; j++)
455 thread = &dpsoftrast.threads[j];
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (thread->texbound[i])
458 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the flags and dimensions, claims an unused
// slot (growing the slot array if needed), fills in the mip level table and
// allocates zeroed pixel storage for all levels. Validation failures record a
// message in dpsoftrast.errorstring.
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per mip level
471 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the per-dimension size limit
// below and can overflow int for large arguments.
474 if (width*height*depth < 1)
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
481 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
486 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
// depth format: rejected for cubemaps and for mipmapped textures
490 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
493 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
503 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// 3D textures (depth != 1) may be neither cubemaps nor mipmapped
508 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
510 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
515 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this appears to be an exact duplicate of the preceding check.
518 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is nonzero exactly when x has more than one bit set
528 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533 // find first empty slot in texture array
534 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
537 dpsoftrast.texture_firstfree = texnum + 1;
538 if (dpsoftrast.texture_max <= texnum)
539 DPSOFTRAST_Texture_Grow();
540 if (dpsoftrast.texture_end <= texnum)
541 dpsoftrast.texture_end = texnum + 1;
542 texture = &dpsoftrast.texture[texnum];
543 memset(texture, 0, sizeof(*texture));
544 texture->flags = flags;
545 texture->width = width;
546 texture->height = height;
547 texture->depth = depth;
548 texture->sides = sides;
// bytes in this mip level: 4 bytes per texel, times the number of sides
560 s = w * h * d * sides * 4;
// table entry layout: offset, size in bytes, width, height, depth
561 texture->mipmap[mipmaps][0] = size;
562 texture->mipmap[mipmaps][1] = s;
563 texture->mipmap[mipmaps][2] = w;
564 texture->mipmap[mipmaps][3] = h;
565 texture->mipmap[mipmaps][4] = d;
// the level list ends at a single texel, or immediately when mipmaps were not requested
568 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 texture->mipmaps = mipmaps;
575 texture->size = size;
577 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the statements here;
// a NULL `bytes` makes the slot look unused to DPSOFTRAST_Texture_GetByIndex.
578 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its slot and updates the
// free-slot hint and the end of the used slot range. Does nothing for an
// index that does not name a live texture.
582 void DPSOFTRAST_Texture_Free(int index)
584 DPSOFTRAST_Texture *texture;
585 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
589 MM_FREE(texture->bytes);
// the memset below also zeroes `bytes`; the explicit NULL makes the intent obvious
590 texture->bytes = NULL;
591 memset(texture, 0, sizeof(*texture));
592 // adjust the free range and used range
593 if (dpsoftrast.texture_firstfree > index)
594 dpsoftrast.texture_firstfree = index;
// drop trailing unused slots from the used range
595 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1 of a texture, each from the level before
// it, by averaging neighbouring texels with rounding. Texels are 4 bytes and
// each of the 4 channels is averaged independently.
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
600 int i, x, y, z, w, layer0, layer1, row0, row1;
// o: output row; i0..i3: input rows taken from (layer0|layer1) x (row0|row1) of the parent level
601 unsigned char *o, *i0, *i1, *i2, *i3;
602 DPSOFTRAST_Texture *texture;
603 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604 if (texture->mipmaps <= 1)
606 for (i = 1;i < texture->mipmaps;i++)
608 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
612 if (layer1 >= texture->mipmap[i-1][4])
613 layer1 = texture->mipmap[i-1][4]-1;
614 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
618 if (row1 >= texture->mipmap[i-1][3])
619 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 bytes * ((height * layer + row) * width)
620 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
621 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625 w = texture->mipmap[i][2];
628 if (texture->mipmap[i-1][2] > 1)
630 // average 3D texture
// eight source texels per output texel: (sum + 4) >> 3
631 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
633 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
641 // average 3D mipmap with parent width == 1
// four source texels per output texel: (sum + 2) >> 2
// NOTE(review): i2/i3 are not advanced here, unlike the loop above; harmless
// only if w is 1 whenever this branch runs — confirm.
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
644 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
653 if (texture->mipmap[i-1][2] > 1)
655 // average 2D texture (common case)
// 2x2 box filter: (sum + 2) >> 2
656 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
658 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
666 // 2D texture with parent width == 1
// two source texels: (sum + 1) >> 1
667 o[0] = (i0[0] + i1[0] + 1) >> 1;
668 o[1] = (i0[1] + i1[1] + 1) >> 1;
669 o[2] = (i0[2] + i1[2] + 1) >> 1;
670 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight block of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuilds
// the mipmaps.
// NOTE(review): destination addressing uses level 0's width and no mip
// offset, and the block rectangle is not range-checked in the statements
// here — confirm how `mip` and the bounds are validated.
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
679 DPSOFTRAST_Texture *texture;
681 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
684 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685 while (blockheight > 0)
687 memcpy(dst, pixels, blockwidth * 4);
// source rows are contiguous; destination rows are a full level-0 row apart
688 pixels += blockwidth * 4;
689 dst += texture->mipmap[0][2] * 4;
692 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 with `pixels` and rebuilds the mipmaps.
// `pixels` must provide at least mipmap[0][1] bytes (the byte size of level 0).
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
696 DPSOFTRAST_Texture *texture;
697 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707 return texture->mipmap[mip][2];
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][3];
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when the texture does not exist.
// NOTE(review): `mip` is not range-checked in the statements here.
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset within `bytes`
727 return texture->bytes + texture->mipmap[mip][0];
// Sets a texture's filter mode. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported as an error for textures
// created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
735 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
740 texture->filter = filter;
// Installs the framebuffer size, the depth buffer and up to four color
// buffers in the global state. The buffers are caller-provided pointers;
// nothing is allocated or copied here.
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when any dimension or buffer pointer differs from the current state
745 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
749 dpsoftrast.fb_width = width;
750 dpsoftrast.fb_height = height;
751 dpsoftrast.fb_depthpixels = depthpixels;
752 dpsoftrast.fb_colorpixels[0] = colorpixels0;
753 dpsoftrast.fb_colorpixels[1] = colorpixels1;
754 dpsoftrast.fb_colorpixels[2] = colorpixels2;
755 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// forward declaration; used by DPSOFTRAST_AllocateCommand
758 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current allocation position: copies commandpool.freecommand to
// dpsoftrast.drawcommand. With threads enabled a MEMORY_BARRIER is issued
// first, presumably so the command bytes written so far become visible before
// the new position does.
760 static void DPSOFTRAST_Draw_SyncCommands(void)
762 if(dpsoftrast.usethreads) MEMORY_BARRIER;
763 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for `space` bytes in the command pool. The distance between each
// thread's commandoffset and the allocation position gives the bytes still in
// use; when that is too much, the function waits on the thread that lags the
// furthest.
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
// the bookkeeping already shows enough free space
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish the pending commands before measuring thread progress
774 DPSOFTRAST_Draw_SyncCommands();
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
// bytes between this thread's position and the allocation position, with wrap-around
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance seen
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
// enough space now, or no thread to wait for
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
794 thread = &dpsoftrast.threads[waitindex];
795 Thread_LockMutex(thread->drawmutex);
// wait only if the thread has not yet reached the published position
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
799 if (thread->starving) Thread_CondSignal(thread->drawcond);
800 Thread_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 Thread_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE; the mask arithmetic
// requires COMMAND_SIZE to be a power of two.
808 #define DPSOFTRAST_ALIGNCOMMAND(size) \
809 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a record for command <name> with its aligned size and opcode, and
// yields it as a DPSOFTRAST_Command_<name> pointer.
810 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
811 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and writes its header (opcode, commandsize). When the record does not
// fit in the tail of the pool, a Reset opcode is written at the current
// position and the unused tail is counted as used.
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
815 DPSOFTRAST_Command *command;
816 int freecommand = dpsoftrast.commandpool.freecommand;
817 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space needed beyond `size`: one command header, plus the skipped tail if the record must wrap
818 int extra = sizeof(DPSOFTRAST_Command);
819 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for threads (threaded) or flush
821 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
823 if (dpsoftrast.usethreads)
824 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
826 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the bookkeeping
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
// record does not fit in the tail: mark the wrap point with a Reset command
830 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
// NOTE(review): commandsize is an unsigned short; sizes above 65535 would be truncated
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recent allocation of `size` bytes by adjusting the
// pool's freecommand and usedcommands bookkeeping.
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
// pool-size correction of the position (wrap-around handling)
854 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855 usedcommands -= size;
856 dpsoftrast.commandpool.freecommand = freecommand;
857 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): the viewport rectangle.
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Applies a Viewport command to a thread's state and marks its
// framebuffer-derived values dirty.
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 thread->viewport[0] = command->x;
864 thread->viewport[1] = command->y;
865 thread->viewport[2] = command->width;
866 thread->viewport[3] = command->height;
867 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and updates the global viewport together with its
// derived center/scale.
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
881 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): the clear color as four floats.
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Clears the scissor rectangle of every non-NULL color buffer to the command's
// color packed as BGRA8, limited to the rows in the thread's bands
// (miny1..maxy1 and miny2..maxy2).
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 int i, x1, y1, x2, y2, w, h, x, y;
888 int miny1 = thread->miny1;
889 int maxy1 = thread->maxy1;
890 int miny2 = thread->miny2;
891 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
895 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896 x1 = thread->fb_scissor[0];
897 y1 = thread->fb_scissor[1];
898 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to what the thread's bands cover
900 if (y1 < miny1) y1 = miny1;
901 if (y2 > maxy2) y2 = maxy2;
906 // FIXME: honor fb_colormask?
907 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
908 for (i = 0;i < 4;i++)
910 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1; the step expression then moves on to the second band
912 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
915 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
923 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): the clear depth as a float.
930 DEFCOMMAND(3, ClearDepth, float depth;)
// Clears the scissor rectangle of the depth buffer to the command's depth
// converted with DPSOFTRAST_DEPTH32_FROM_DEPTH32F, limited to the rows in the
// thread's bands (miny1..maxy1 and miny2..maxy2).
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
933 int x1, y1, x2, y2, w, h, x, y;
934 int miny1 = thread->miny1;
935 int maxy1 = thread->maxy1;
936 int miny2 = thread->miny2;
937 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
941 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942 x1 = thread->fb_scissor[0];
943 y1 = thread->fb_scissor[1];
944 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to what the thread's bands cover
946 if (y1 < miny1) y1 = miny1;
947 if (y2 > maxy2) y2 = maxy2;
952 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1; the step expression then moves on to the second band
953 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
956 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
961 void DPSOFTRAST_ClearDepth(float d)
963 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one enable value per channel.
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores the per-channel enables as 0/1 and builds fb_colormask, a 32-bit
// mask in the BGRA8 pixel layout with 0xFF in every enabled channel.
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
970 thread->colormask[0] = command->r != 0;
971 thread->colormask[1] = command->g != 0;
972 thread->colormask[2] = command->b != 0;
973 thread->colormask[3] = command->a != 0;
// negating 1 gives all bits set and negating 0 gives 0, so each term is either the channel's byte mask or 0
974 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
978 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag.
985 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the depth test enable and marks the derived depth function dirty.
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
988 thread->depthtest = command->enable;
989 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command.
991 void DPSOFTRAST_DepthTest(int enable)
993 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994 command->enable = enable;
// ScissorTest command (opcode 6): enable flag.
997 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the scissor test enable and marks the framebuffer-derived values dirty.
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1000 thread->scissortest = command->enable;
1001 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command.
1003 void DPSOFTRAST_ScissorTest(int enable)
1005 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006 command->enable = enable;
// Scissor command (opcode 7): the scissor rectangle as floats.
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Stores the scissor rectangle and marks the framebuffer-derived values dirty.
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1012 thread->scissor[0] = command->x;
1013 thread->scissor[1] = command->y;
1014 thread->scissor[2] = command->width;
1015 thread->scissor[3] = command->height;
1016 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1020 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1023 command->width = width;
1024 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors.
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the blend factors and marks the derived blend mode dirty.
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1030 thread->blendfunc[0] = command->sfactor;
1031 thread->blendfunc[1] = command->dfactor;
1032 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command.
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1036 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037 command->sfactor = sfactor;
1038 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag.
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the subtractive-blend enable and marks the derived blend mode dirty.
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1044 thread->blendsubtract = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command.
1047 void DPSOFTRAST_BlendSubtract(int enable)
1049 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050 command->enable = enable;
// DepthMask command (opcode 10): enable flag.
1053 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the depth mask enable in the thread's state.
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1056 thread->depthmask = command->enable;
// Queues a DepthMask command.
1058 void DPSOFTRAST_DepthMask(int enable)
1060 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061 command->enable = enable;
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1067 thread->depthfunc = command->func;
// Queues a DepthFunc command carrying the depth comparison function.
1069 void DPSOFTRAST_DepthFunc(int func)
1071 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072 command->func = func;
// DepthRange command (opcode 12): near and far depth values.
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the depth range as depthrange[0] = near, depthrange[1] = far.
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1078 thread->depthrange[0] = command->nearval;
1079 thread->depthrange[1] = command->farval;
// Queues a DepthRange command.
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1083 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084 command->nearval = nearval;
1085 command->farval = farval;
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1091 thread->polygonoffset[0] = command->alongnormal;
1092 thread->polygonoffset[1] = command->intoview;
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1096 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097 command->alongnormal = alongnormal;
1098 command->intoview = intoview;
1101 DEFCOMMAND(14, CullFace, int mode;)
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1104 thread->cullface = command->mode;
1106 void DPSOFTRAST_CullFace(int mode)
1108 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109 command->mode = mode;
1112 DEFCOMMAND(15, AlphaTest, int enable;)
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1115 thread->alphatest = command->enable;
1117 void DPSOFTRAST_AlphaTest(int enable)
1119 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120 command->enable = enable;
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1126 thread->alphafunc = command->func;
1127 thread->alphavalue = command->ref;
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1131 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132 command->func = func;
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1138 dpsoftrast.color[0] = r;
1139 dpsoftrast.color[1] = g;
1140 dpsoftrast.color[2] = b;
1141 dpsoftrast.color[3] = a;
// Reads back a rectangle of color buffer 0 into outpixels, 4 bytes per pixel.
// The requested rectangle is clipped against the framebuffer size. Source rows
// are addressed as (fb_height - 1 - y), i.e. flipped vertically relative to
// framebuffer storage. Output rows are always outstride (blockwidth * 4) bytes
// apart, whether or not the rectangle was clipped.
// NOTE(review): the statements that actually copy pixels are not visible in
// this listing; the bigendian branch iterates per pixel and presumably
// reorders bytes -- confirm against the full file.
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row pitches in bytes: output uses the requested width, input the framebuffer width
1146 int outstride = blockwidth * 4;
1147 int instride = dpsoftrast.fb_width * 4;
// exclusive right/bottom edges of the requested rectangle
1150 int bx2 = blockx + blockwidth;
1151 int by2 = blocky + blockheight;
1156 unsigned char *inpixels;
// clip the rectangle to the framebuffer bounds
1160 if (bx1 < 0) bx1 = 0;
1161 if (by1 < 0) by1 = 0;
1162 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1163 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1166 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1167 if (dpsoftrast.bigendian)
// big-endian path: for each row, b points at the flipped source row starting
// at column bx1 and o at the start of output row (y - by1)
1169 for (y = by1;y < by2;y++)
1171 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1172 o = (unsigned char *)outpixels + (y - by1) * outstride;
1173 for (x = bx1;x < bx2;x++)
// other path: same row addressing as above
1186 for (y = by1;y < by2;y++)
1188 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 (source origin sx,sy)
// into mip level `mip` of texture `index` (destination origin tx,ty).
// Returns without copying when the texture handle does not resolve or when mip
// is outside [0, texture->mipmaps). Both rectangles are clipped to their
// surfaces and the copy size is limited to the smaller of the two. Rows are
// copied with memcpy, tw*4 bytes each. If the texture has more than one mip
// level, DPSOFTRAST_Texture_CalculateMipmaps(index) is called afterwards.
// NOTE(review): several declarations, the computation of tw/th/sw/sh and the
// statement guarded by `tw < 1 || th < 1` are not visible in this listing.
1195 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive right/bottom edges of the destination and source rectangles
1199 int tx2 = tx + width;
1200 int ty2 = ty + height;
1203 int sx2 = sx + width;
1204 int sy2 = sy + height;
1214 unsigned int *spixels;
1215 unsigned int *tpixels;
1216 DPSOFTRAST_Texture *texture;
1217 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1218 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0 with the framebuffer dimensions
1220 spixels = dpsoftrast.fb_colorpixels[0];
1221 swidth = dpsoftrast.fb_width;
1222 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the level width and [3] as the level height
1223 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1224 twidth = texture->mipmap[mip][2];
1225 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the mip level, then the source rectangle
// to the framebuffer
1226 if (tx1 < 0) tx1 = 0;
1227 if (ty1 < 0) ty1 = 0;
1228 if (tx2 > twidth) tx2 = twidth;
1229 if (ty2 > theight) ty2 = theight;
1230 if (sx1 < 0) sx1 = 0;
1231 if (sy1 < 0) sy1 = 0;
1232 if (sx2 > swidth) sx2 = swidth;
1233 if (sy2 > sheight) sy2 = sheight;
// the copied area can be no larger than the (clipped) source area
1238 if (tw > sw) tw = sw;
1239 if (th > sh) th = sh;
1240 if (tw < 1 || th < 1)
// row-by-row copy; offsets are in 32-bit pixels, the length in bytes
1242 for (y = 0;y < th;y++)
1243 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1244 if (texture->mipmaps > 1)
1245 DPSOFTRAST_Texture_CalculateMipmaps(index);
1248 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1249 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1251 if (thread->texbound[command->unitnum])
1252 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1253 thread->texbound[command->unitnum] = command->texture;
1255 void DPSOFTRAST_SetTexture(int unitnum, int index)
1257 DPSOFTRAST_Command_SetTexture *command;
1258 DPSOFTRAST_Texture *texture;
1259 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1261 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1264 texture = DPSOFTRAST_Texture_GetByIndex(index);
1265 if (index && !texture)
1267 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1271 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1272 command->unitnum = unitnum;
1273 command->texture = texture;
1275 dpsoftrast.texbound[unitnum] = texture;
1276 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1279 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1281 dpsoftrast.pointer_vertex3f = vertex3f;
1282 dpsoftrast.stride_vertex = stride;
1284 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1286 dpsoftrast.pointer_color4f = color4f;
1287 dpsoftrast.pointer_color4ub = NULL;
1288 dpsoftrast.stride_color = stride;
1290 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1292 dpsoftrast.pointer_color4f = NULL;
1293 dpsoftrast.pointer_color4ub = color4ub;
1294 dpsoftrast.stride_color = stride;
1296 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1298 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1299 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1300 dpsoftrast.stride_texcoord[unitnum] = stride;
1303 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1304 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1306 thread->shader_mode = command->mode;
1307 thread->shader_permutation = command->permutation;
1309 void DPSOFTRAST_SetShader(int mode, int permutation)
1311 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1312 command->mode = mode;
1313 command->permutation = permutation;
1315 dpsoftrast.shader_mode = mode;
1316 dpsoftrast.shader_permutation = permutation;
1319 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1320 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1322 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1324 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1326 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1327 command->index = index;
1328 command->val[0] = v0;
1329 command->val[1] = v1;
1330 command->val[2] = v2;
1331 command->val[3] = v3;
1333 dpsoftrast.uniform4f[index*4+0] = v0;
1334 dpsoftrast.uniform4f[index*4+1] = v1;
1335 dpsoftrast.uniform4f[index*4+2] = v2;
1336 dpsoftrast.uniform4f[index*4+3] = v3;
1338 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1340 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1341 command->index = index;
1342 memcpy(command->val, v, sizeof(command->val));
1344 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1347 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1348 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1350 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices read from v. Each matrix
// gets its own UniformMatrix4f command and is also written to the global
// dpsoftrast.uniform4f copy. The slot index starts at `uniform` and advances
// by 4 per matrix, while v advances by 16 floats.
// NOTE(review): the branch keywords between the load groups and around the
// shuffle group are not visible in this listing; the shuffle group is
// presumably guarded by `transpose` -- confirm against the full file.
1352 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1356 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1358 __m128 m0, m1, m2, m3;
1359 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1360 command->index = index;
// source pointer not a multiple of ALIGN_SIZE: use unaligned loads
1361 if (((size_t)v)&(ALIGN_SIZE-1))
1363 m0 = _mm_loadu_ps(v);
1364 m1 = _mm_loadu_ps(v+4);
1365 m2 = _mm_loadu_ps(v+8);
1366 m3 = _mm_loadu_ps(v+12);
// aligned loads of the same four rows
1370 m0 = _mm_load_ps(v);
1371 m1 = _mm_load_ps(v+4);
1372 m2 = _mm_load_ps(v+8);
1373 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine low/high halves so that
// m0..m3 end up holding the former columns
1377 __m128 t0, t1, t2, t3;
1378 t0 = _mm_unpacklo_ps(m0, m1);
1379 t1 = _mm_unpacklo_ps(m2, m3);
1380 t2 = _mm_unpackhi_ps(m0, m1);
1381 t3 = _mm_unpackhi_ps(m2, m3);
1382 m0 = _mm_movelh_ps(t0, t1);
1383 m1 = _mm_movehl_ps(t1, t0);
1384 m2 = _mm_movelh_ps(t2, t3);
1385 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1387 _mm_store_ps(command->val, m0);
1388 _mm_store_ps(command->val+4, m1);
1389 _mm_store_ps(command->val+8, m2);
1390 _mm_store_ps(command->val+12, m3);
// aligned stores into the global copy; these require
// &dpsoftrast.uniform4f[index*4] to be 16-byte aligned
1391 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1392 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1393 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1394 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1399 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1400 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1402 thread->uniform1i[command->index] = command->val;
1404 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1406 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1407 command->index = index;
1410 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float items from a strided byte source into dst
// (end = dst + size*4 floats). dst is always written with aligned stores.
// NOTE(review): the loop statements that advance dst and src are not visible
// in this listing.
1414 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1416 float *end = dst + size*4;
// unaligned loads are needed when either the base pointer or the stride is not
// a multiple of ALIGN_SIZE
1417 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1421 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// otherwise every item is aligned and aligned loads are used
1430 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float items from a strided byte source into 4-float items
// (x, y, z, 1.0) in dst. dst is written with aligned stores.
// Tightly packed input (stride == 12 bytes) is handled four items at a time
// up to end4: three 16-byte loads cover exactly four items. The code after
// that handles one item per 16-byte load.
// NOTE(review): the loop statements and branch keywords are not visible in
// this listing. Also, the one-item path loads 16 bytes from a 12-byte item, so
// it reads 4 bytes past the final item -- confirm callers' buffers allow that.
1437 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1439 float *end = dst + size*4;
1440 if (stride == sizeof(float[3]))
// end of the portion that is a multiple of four items
1442 float *end4 = dst + (size&~3)*4;
1443 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3.
// Each output is built by rotating the three wanted components into lanes
// 1..3, putting 1.0 into lane 0 with move_ss, then rotating back to x y z 1.
1447 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1448 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1449 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1450 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1452 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1453 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1454 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1455 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1456 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1457 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1458 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1459 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1461 src += 4*sizeof(float[3]);
// aligned variant of the same four-item expansion
1468 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1469 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482 src += 4*sizeof(float[3]);
// one item per load: unaligned loads if the base pointer or stride is not a
// multiple of ALIGN_SIZE
1486 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// load x y z ?, rotate to ? x y z, overwrite lane 0 with 1.0, rotate back
1490 __m128 v = _mm_loadu_ps((const float *)src);
1491 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1492 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1493 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1494 _mm_store_ps(dst, v);
// aligned variant of the one-item expansion
1503 __m128 v = _mm_load_ps((const float *)src);
1504 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507 _mm_store_ps(dst, v);
// Expands `size` 2-float items from a strided byte source into 4-float items
// (a, b, 0.0, 1.0) in dst. Tightly packed input (stride == 8 bytes) is handled
// two items per 16-byte load up to end2; the last visible statement handles
// one item with a 64-bit load.
// NOTE(review): the loop statements and branch keywords are not visible in
// this listing.
1514 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1516 float *end = dst + size*4;
// supplies the constant upper half (0.0, 1.0) of every output item
1517 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1518 if (stride == sizeof(float[2]))
// end of the portion that is a multiple of two items
1520 float *end2 = dst + (size&~1)*4;
1521 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v = a0 b0 a1 b1; the shuffle yields a0 b0 0 1 and movehl
// yields a1 b1 0 1
1525 __m128 v = _mm_loadu_ps((const float *)src);
1526 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1527 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1529 src += 2*sizeof(float[2]);
// aligned variant of the same two-item expansion
1536 __m128 v = _mm_load_ps((const float *)src);
1537 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1538 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1540 src += 2*sizeof(float[2]);
// one item: replace the low two lanes of v2 with the two source floats
1546 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte items from a strided byte source into 4-float items
// in dst, scaling each unsigned byte by 1/255. Tightly packed input
// (stride == 4 bytes) is handled four items per 16-byte load up to end4; the
// last visible statements convert one item from a single 32-bit load.
// NOTE(review): the loop statements and branch keywords are not visible in
// this listing. The one-item path reads src through an int pointer, which
// assumes that such an access is acceptable for src's alignment -- confirm.
1552 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1554 float *end = dst + size*4;
1555 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1556 if (stride == sizeof(unsigned char[4]))
// end of the portion that is a multiple of four items
1558 float *end4 = dst + (size&~3)*4;
1559 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: widen bytes to 16 bits (v1 = items 0-1, v2 = items 2-3),
// then to 32 bits, convert to float and scale
1563 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1564 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1565 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1566 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1567 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1569 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same four-item conversion
1576 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1582 src += 4*sizeof(unsigned char[4]);
// one item: load 4 bytes, widen twice, convert and scale
1588 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1589 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the 4-float value read from src into dst; end marks dst + 4*size
// floats. src is read once with an unaligned load; dst is written with aligned
// stores.
// NOTE(review): the loop statements around the store are not visible in this
// listing; presumably the store repeats until dst reaches end.
1595 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1597 float *end = dst + 4*size;
1598 __m128 v = _mm_loadu_ps(src);
1601 _mm_store_ps(dst, v);
// Transforms numitems 4-float vertices from in4f into out4f by the 4x4 matrix
// inmatrix16f. Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3, where m0..m3
// are the four consecutive 4-float groups of the matrix.
// A matrix that is byte-identical to the identity only copies the data, and
// copies nothing when out4f == in4f.
// NOTE(review): the loop statements, the store of each result and the return
// after the identity case are not visible in this listing.
1607 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1610 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1611 __m128 m0, m1, m2, m3;
1613 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1615 // fast case for identity matrix
1616 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1619 end = out4f + numitems*4;
// the matrix is loaded with unaligned loads, so it needs no special alignment
1620 m0 = _mm_loadu_ps(inmatrix16f);
1621 m1 = _mm_loadu_ps(inmatrix16f + 4);
1622 m2 = _mm_loadu_ps(inmatrix16f + 8);
1623 m3 = _mm_loadu_ps(inmatrix16f + 12);
1624 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each component of v and accumulate against m0..m3
1628 __m128 v = _mm_loadu_ps(in4f);
1630 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1631 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1632 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1633 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same computation with an aligned load
1642 __m128 v = _mm_load_ps(in4f);
1644 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1645 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1646 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1647 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vertices from in4f to out4f.
// out4f and in4f may be the same array or overlap. A count of zero or less
// copies nothing.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would convert to an enormous size_t below, and an
	// in-place copy has nothing to do
	if (numitems <= 0 || out4f == in4f)
		return;
	// memmove rather than memcpy: memcpy is undefined for overlapping ranges,
	// and DPSOFTRAST_Vertex_Transform already has to guard against being
	// handed the same array as input and output
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Perspective-divides and viewport-maps one vertex held in an __m128.
// `in` is (x, y, z, w). The lanes are rotated to (w, x, y, z), lane 0 is
// replaced by 1.0, each lane is multiplied by viewportscale and divided by w,
// viewportcenter is added, and the lanes are rotated back. Lane 0 of
// viewportcenter/viewportscale therefore applies to the output's last lane and
// lanes 1..3 to x, y, z. `out` receives
// (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w).
// `in` is evaluated once; viewportcenter and viewportscale are used as given.
1661 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1663 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1664 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1665 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1666 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Perspective-divides and viewport-maps one vertex held in an __m128.
// The visible statements are identical to those of DPSOFTRAST_PROJECTVERTEX:
// rotate (x, y, z, w) to (w, x, y, z), replace lane 0 by 1.0, multiply by
// viewportscale, divide by w, add viewportcenter, rotate back.
// NOTE(review): despite the name, nothing in the visible body restricts the
// computation to the y component -- confirm whether the duplication is
// intended.
1669 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1671 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1672 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1673 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1674 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one vertex by a 4x4 matrix given as four __m128 values:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, with each component of p broadcast
// across all lanes before the multiply.
// NOTE(review): the declaration of p is not visible in this listing;
// presumably it is initialized from `in`.
1677 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1680 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1681 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1682 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1683 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space vertical range (*starty, *endy) for the axis-aligned
// box spanned by minpos/maxpos after transformation by the matrix m0..m3.
// clipmask starts with all eight corner bits set; a corner's bit is cleared
// when its clip distance is >= 0.
// NOTE(review): several short lines (the BBCLIP #define line, two BBFRONT
// invocations, braces and the return statement) are not visible in this
// listing; the function presumably returns clipmask -- confirm.
1686 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1688 int clipmask = 0xFF;
// minproj/maxproj start outside the [-2, 2] range they are later clamped to
1689 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix row so the transformed y component lands
// in lane 0, where the scalar (_ss) operations below work
1690 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1691 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1692 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1693 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform box corner k into bb[k] and compute
// clipdist[k] = z + w in lane 0. When that is >= 0, clear bit k of clipmask,
// divide lane 0 (y) by w and fold the result into minproj/maxproj.
// The invocations enumerate the corners by mixing lanes of minpos and maxpos.
// The second macro body below (the BBCLIP(k) invocations) handles corners
// whose bit is still set: for each neighbour along one axis (k^1, k^2, k^4)
// whose bit is clear, it interpolates along that edge by
// frac = clipdist[k] / (clipdist[k] - clipdist[neighbour]), divides by w and
// folds the result into minproj/maxproj.
1694 #define BBFRONT(k, pos) \
1696 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1697 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1698 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1701 clipmask &= ~(1<<k); \
1702 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1703 minproj = _mm_min_ss(minproj, proj); \
1704 maxproj = _mm_max_ss(maxproj, proj); \
1708 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1709 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1710 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1711 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1712 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1713 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1717 if (clipmask&(1<<k)) \
1719 if (!(clipmask&(1<<(k^1)))) \
1721 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1722 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1723 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1724 minproj = _mm_min_ss(minproj, proj); \
1725 maxproj = _mm_max_ss(maxproj, proj); \
1727 if (!(clipmask&(1<<(k^2)))) \
1729 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1730 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1731 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1732 minproj = _mm_min_ss(minproj, proj); \
1733 maxproj = _mm_max_ss(maxproj, proj); \
1735 if (!(clipmask&(1<<(k^4)))) \
1737 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1738 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1739 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1740 minproj = _mm_min_ss(minproj, proj); \
1741 maxproj = _mm_max_ss(maxproj, proj); \
1745 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
// (lane 2 is the entry DPSOFTRAST_PROJECTVERTEX applies to y)
1746 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1747 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to the viewport
1748 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1749 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1750 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1751 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// start comes from maxproj and end from minproj (+1 after truncation), which
// is consistent with a negative y viewport scale -- TODO confirm
1752 *starty = _mm_cvttss_si32(maxproj);
1753 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems 4-float vertices without transforming them. Each input
// vertex is stored unchanged to out4f and its viewport-projected form to
// screen4f, while the component-wise min/max of the inputs is tracked.
// Returns the result of DPSOFTRAST_Vertex_BoundY for that bounding box with an
// identity matrix, which also fills *starty and *endy.
// NOTE(review): the loop statements and pointer increments are not visible in
// this listing.
1757 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1759 float *end = out4f + numitems*4;
// aligned loads: the viewport vectors must be 16-byte aligned
1760 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1761 __m128 minpos, maxpos;
1762 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1764 minpos = maxpos = _mm_loadu_ps(in4f);
1767 __m128 v = _mm_loadu_ps(in4f);
1768 minpos = _mm_min_ps(minpos, v);
1769 maxpos = _mm_max_ps(maxpos, v);
1770 _mm_store_ps(out4f, v);
1771 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1772 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1780 minpos = maxpos = _mm_load_ps(in4f);
1783 __m128 v = _mm_load_ps(in4f);
1784 minpos = _mm_min_ps(minpos, v);
1785 maxpos = _mm_max_ps(maxpos, v);
1786 _mm_store_ps(out4f, v);
1787 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1788 _mm_store_ps(screen4f, v);
// the four vectors passed as the matrix form the identity
1795 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1796 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1797 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1798 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1799 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems 4-float vertices by inmatrix16f and projects them. The
// transformed vertex is stored to out4f and its viewport-projected form to
// screen4f. The component-wise min/max is tracked on the UNtransformed inputs;
// DPSOFTRAST_Vertex_BoundY then receives that box together with the matrix.
// A matrix byte-identical to the identity is delegated to
// DPSOFTRAST_Vertex_Project.
// Returns the result of DPSOFTRAST_Vertex_BoundY, which also fills *starty and
// *endy.
// NOTE(review): the loop statements and pointer increments are not visible in
// this listing.
1803 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1805 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1806 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1808 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1809 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1810 end = out4f + numitems*4;
// aligned loads: the viewport vectors must be 16-byte aligned
1811 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1812 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is loaded with unaligned loads
1813 m0 = _mm_loadu_ps(inmatrix16f);
1814 m1 = _mm_loadu_ps(inmatrix16f + 4);
1815 m2 = _mm_loadu_ps(inmatrix16f + 8);
1816 m3 = _mm_loadu_ps(inmatrix16f + 12);
1817 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1819 minpos = maxpos = _mm_loadu_ps(in4f);
1822 __m128 v = _mm_loadu_ps(in4f);
1823 minpos = _mm_min_ps(minpos, v);
1824 maxpos = _mm_max_ps(maxpos, v);
1825 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1826 _mm_store_ps(out4f, v);
1827 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1828 _mm_store_ps(screen4f, v);
// aligned input: same processing with aligned loads
1836 minpos = maxpos = _mm_load_ps(in4f);
1839 __m128 v = _mm_load_ps(in4f);
1840 minpos = _mm_min_ps(minpos, v);
1841 maxpos = _mm_max_ps(maxpos, v);
1842 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1843 _mm_store_ps(out4f, v);
1844 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1845 _mm_store_ps(screen4f, v);
1852 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Converts the client array selected by inarray into 4 floats per vertex in
// dpsoftrast.post_array4f[outarray], for numvertices vertices starting at
// firstvertex. The source address is the array's base pointer plus
// firstvertex * stride bytes.
// NOTE(review): the switch/case labels, break statements and the return are
// not visible in this listing; the function presumably returns outf.
1857 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1860 float *outf = dpsoftrast.post_array4f[outarray];
1861 const unsigned char *inb;
1862 int firstvertex = dpsoftrast.firstvertex;
1863 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to 4
1867 case DPSOFTRAST_ARRAY_POSITION:
1868 stride = dpsoftrast.stride_vertex;
1869 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1870 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: a float array is preferred, then a byte array; the last visible
// statement fills every vertex with the constant dpsoftrast.color
1872 case DPSOFTRAST_ARRAY_COLOR:
1873 stride = dpsoftrast.stride_color;
1874 if (dpsoftrast.pointer_color4f)
1876 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1877 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1879 else if (dpsoftrast.pointer_color4ub)
1881 stride = dpsoftrast.stride_color;
1882 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1883 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1887 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinates: the slot is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and
// the loader is chosen by components_texcoord (presumably 2, 3 or 4, matching
// the loader names)
1891 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1892 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1894 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1895 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1898 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1901 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1904 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post-processed array in place by inmatrix16f. When inarray is
// >= 0 the array is first loaded from client data via DPSOFTRAST_Array_Load;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray] are
// used.
// NOTE(review): the return statement is not visible in this listing;
// presumably data is returned.
1916 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1918 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// same buffer as input and output
1919 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post-processed array to screen space. When inarray is >= 0 the
// array is first loaded from client data via DPSOFTRAST_Array_Load; otherwise
// the existing contents of dpsoftrast.post_array4f[outarray] are used.
// Screen coordinates go to dpsoftrast.screencoord4f, the vertical range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement and any preprocessor guards are not
// visible in this listing.
1924 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1927 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// same buffer as input and output
1928 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post-processed array in place by inmatrix16f and projects it to
// screen space. When inarray is >= 0 the array is first loaded from client
// data via DPSOFTRAST_Array_Load; otherwise the existing contents of
// dpsoftrast.post_array4f[outarray] are used.
// Screen coordinates go to dpsoftrast.screencoord4f, the vertical range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// NOTE(review): the return statement and any preprocessor guards are not
// visible in this listing.
1936 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1939 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// same buffer as input and output
1940 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for a span. w is evaluated from the
// triangle's plane (w[0] = slope along x, w[1] = slope along y, w[2] =
// constant) at the span origin. The exact reciprocal 1/w is computed only at
// sub-span boundaries, at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart, and
// stepped linearly (dz) in between.
// NOTE(review): the declarations of x, z and dz and the loop body statement
// are not visible in this listing; presumably each z is written to zf[x].
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1950 int startx = span->startx;
1951 int endx = span->endx;
1952 float wslope = triangle->w[0];
1953 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// reciprocal at the first pixel of the span
1954 float endz = 1.0f / (w + wslope * startx);
1955 for (x = startx;x < endx;)
1957 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final sub-span ends on the last pixel of the span
1959 if (nextsub >= endx) nextsub = endsub = endx-1;
1960 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel
1961 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962 for (; x <= endsub; x++, z += dz)
// Writes a span of float RGBA colors (in4f, 4 floats per pixel, indexed by x)
// into color buffer 0 according to thread->fb_blendmode. The framebuffer byte
// order is B, G, R, A: byte 0 takes input component 2, byte 1 component 1,
// byte 2 component 0, byte 3 component 3.
// NOTE(review): short lines (declarations of x, d, a, b; braces; break
// statements; any per-pixel pixelmask test) are not visible in this listing.
// NOTE(review): apart from SUBALPHA, results are clamped only at 255, not at
// 0 -- confirm inputs cannot produce negative values.
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1970 int startx = span->startx;
1971 int endx = span->endx;
1974 unsigned char * RESTRICT pixelmask = span->pixelmask;
1975 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// advance to the span origin so pixel[x*4] addresses pixel x of this span
1978 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979 // handle alphatest now (this affects depth writes too)
// NOTE(review): the threshold is a fixed 0.5; thread->alphafunc and
// thread->alphavalue are not consulted here
1980 if (thread->alphatest)
1981 for (x = startx;x < endx;x++)
1982 if (in4f[x*4+3] < 0.5f)
1983 pixelmask[x] = false;
1984 // FIXME: this does not handle bigendian
1985 switch(thread->fb_blendmode)
// OPAQUE: dst = src
1987 case DPSOFTRAST_BLENDMODE_OPAQUE:
1988 for (x = startx;x < endx;x++)
1992 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996 pixel[x*4+0] = d[0];
1997 pixel[x*4+1] = d[1];
1998 pixel[x*4+2] = d[2];
1999 pixel[x*4+3] = d[3];
// ALPHA: dst = src * srcalpha + dst * (1 - srcalpha)
2002 case DPSOFTRAST_BLENDMODE_ALPHA:
2003 for (x = startx;x < endx;x++)
2007 a = in4f[x*4+3] * 255.0f;
2008 b = 1.0f - in4f[x*4+3];
2009 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * srcalpha + dst
2019 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029 pixel[x*4+0] = d[0];
2030 pixel[x*4+1] = d[1];
2031 pixel[x*4+2] = d[2];
2032 pixel[x*4+3] = d[3];
// ADD: dst = src + dst
2035 case DPSOFTRAST_BLENDMODE_ADD:
2036 for (x = startx;x < endx;x++)
2040 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044 pixel[x*4+0] = d[0];
2045 pixel[x*4+1] = d[1];
2046 pixel[x*4+2] = d[2];
2047 pixel[x*4+3] = d[3];
// INVMOD: dst = dst * (1 - src)
2050 case DPSOFTRAST_BLENDMODE_INVMOD:
2051 for (x = startx;x < endx;x++)
2055 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059 pixel[x*4+0] = d[0];
2060 pixel[x*4+1] = d[1];
2061 pixel[x*4+2] = d[2];
2062 pixel[x*4+3] = d[3];
// MUL: dst = dst * src
2065 case DPSOFTRAST_BLENDMODE_MUL:
2066 for (x = startx;x < endx;x++)
2070 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074 pixel[x*4+0] = d[0];
2075 pixel[x*4+1] = d[1];
2076 pixel[x*4+2] = d[2];
2077 pixel[x*4+3] = d[3];
// MUL2: dst = dst * src * 2
2080 case DPSOFTRAST_BLENDMODE_MUL2:
2081 for (x = startx;x < endx;x++)
2085 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089 pixel[x*4+0] = d[0];
2090 pixel[x*4+1] = d[1];
2091 pixel[x*4+2] = d[2];
2092 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * srcalpha (a is negative), clamped to [0, 255]
2095 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096 for (x = startx;x < endx;x++)
2100 a = in4f[x*4+3] * -255.0f;
2101 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105 pixel[x*4+0] = d[0];
2106 pixel[x*4+1] = d[1];
2107 pixel[x*4+2] = d[2];
2108 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - srcalpha); the assignment of a is
// not visible in this listing
2111 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112 for (x = startx;x < endx;x++)
2117 b = 1.0f - in4f[x*4+3];
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
// INVADD: dst = (255 - dst) * src + dst
// NOTE(review): unlike every other mode, d[0] is computed from framebuffer
// byte 2 and input component 0 (and d[2] from byte 0 and component 2) but is
// still written to byte 0, so the first and third framebuffer bytes are
// exchanged on every write -- looks like an indexing bug; confirm.
2128 case DPSOFTRAST_BLENDMODE_INVADD:
2129 for (x = startx;x < endx;x++)
2133 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137 pixel[x*4+0] = d[0];
2138 pixel[x*4+1] = d[1];
2139 pixel[x*4+2] = d[2];
2140 pixel[x*4+3] = d[3];
// Writes one span of shaded 8-bit pixels (in4ub, 4 bytes per pixel) into color
// buffer 0, combining them with the pixels already there according to
// thread->fb_blendmode. span->pixelmask decides which pixels may be written.
// NOTE(review): this listing omits short lines (braces, brief statements, some
// case labels), so the comments below describe only what is visible.
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2150 int startx = span->startx;
2151 int endx = span->endx;
2152 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153 unsigned char * RESTRICT pixelmask = span->pixelmask;
2154 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views of the color buffer (byte-wise and 32-bit) to pixel (span->x, span->y)
2158 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159 pixeli += span->y * dpsoftrast.fb_width + span->x;
2160 // handle alphatest now (this affects depth writes too)
// NOTE(review): in4ub holds unsigned bytes, so "< 0.5f" only rejects alpha == 0;
// a byte threshold such as 128 may have been intended - confirm.
2161 if (thread->alphatest)
2162 for (x = startx;x < endx;x++)
2163 if (in4ub[x*4+3] < 0.5f)
2164 pixelmask[x] = false;
2165 // FIXME: this does not handle bigendian
2166 switch(thread->fb_blendmode)
// opaque: walk four pixels at a time; when all four mask bytes equal 1 the
// 16 bytes are copied with one unaligned SSE2 load/store
2168 case DPSOFTRAST_BLENDMODE_OPAQUE:
2169 for (x = startx;x + 4 <= endx;)
2171 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2173 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1): steps through the span two pixels at a time and
// switches on the two mask bytes read as one 16-bit value. Source and
// destination bytes are widened to 16-bit lanes, and the result is packed back
// to bytes with unsigned saturation (_mm_packus_epi16). A trailing loop handles
// a leftover pixel. Presumably blend2 is the expression used for a pixel pair
// and blend1 the one for a single pixel (their use is on lines not shown).
// ALPHA blend: dst += (src - dst) * srcalpha / 256; the mulhi of both operands
// shifted left by 4 yields the product >> 8.
2188 #define FINISHBLEND(blend2, blend1) \
2189 for (x = startx;x + 1 < endx;x += 2) \
2192 switch (*(const unsigned short*)&pixelmask[x]) \
2195 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2198 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2201 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2204 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2207 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2210 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2215 for(;x < endx; x++) \
2218 if (!pixelmask[x]) \
2220 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2223 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2227 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with source alpha: dst += (src * srcalpha) >> 8
2234 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// plain additive: dst = src + dst (the packus in FINISHBLEND saturates at 255)
2243 case DPSOFTRAST_BLENDMODE_ADD:
2244 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst -= (dst * src) >> 8
2246 case DPSOFTRAST_BLENDMODE_INVMOD:
2248 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2250 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: dst = (src * dst) >> 8
2253 case DPSOFTRAST_BLENDMODE_MUL:
2254 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate x2: dst = (src * dst) >> 7
2256 case DPSOFTRAST_BLENDMODE_MUL2:
2257 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with source alpha: dst -= (src * srcalpha) >> 8
2259 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2261 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// source added unscaled over attenuated destination: dst = src + dst - ((dst * srcalpha) >> 8)
2268 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2270 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// inverse additive: dst += (255 - dst) * src / 256
2277 case DPSOFTRAST_BLENDMODE_INVADD:
2279 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2281 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex across the span and
// writes the result as four floats per pixel into out4f. Texture coordinates
// are interpolated from vertex attribute arrayindex (data + slope*x) and
// multiplied by zf[x] (presumably the perspective-correction term - confirm).
// Texel bytes are read as B,G,R,A and stored as R,G,B,A (byte 2 goes to c[0]).
// NOTE(review): this listing omits short lines (braces, brief declarations and
// assignments); comments describe only what is visible.
2288 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2291 int startx = span->startx;
2292 int endx = span->endx;
2297 float tc[2], endtc[2];
2299 unsigned int tci[2];
2300 unsigned int tci1[2];
2301 unsigned int tcimin[2];
2302 unsigned int tcimax[2];
2307 const unsigned char * RESTRICT pixelbase;
2308 const unsigned char * RESTRICT pixel[4];
2309 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2310 // if no texture is bound, just fill it with white
2313 for (x = startx;x < endx;x++)
2315 out4f[x*4+0] = 1.0f;
2316 out4f[x*4+1] = 1.0f;
2317 out4f[x*4+2] = 1.0f;
2318 out4f[x*4+3] = 1.0f;
// pixelbase points at the texel data of the mip level chosen for this triangle
2322 mip = triangle->mip[texunitindex];
2323 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
// NOTE(review): the single-texel fill below reads texture->bytes[0..3] rather
// than pixelbase[0..3]; confirm that the one-texel mip really sits at offset 0.
2324 // if this mipmap of the texture is 1 pixel, just fill it with that color
2325 if (texture->mipmap[mip][1] == 4)
2327 c[0] = texture->bytes[2] * (1.0f/255.0f);
2328 c[1] = texture->bytes[1] * (1.0f/255.0f);
2329 c[2] = texture->bytes[0] * (1.0f/255.0f);
2330 c[3] = texture->bytes[3] * (1.0f/255.0f);
2331 for (x = startx;x < endx;x++)
2333 out4f[x*4+0] = c[0];
2334 out4f[x*4+1] = c[1];
2335 out4f[x*4+2] = c[2];
2336 out4f[x*4+3] = c[3];
2340 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2341 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2342 flags = texture->flags;
// mipmap[mip][2] and [3] supply the coordinate scale, the row width in texels,
// the clamp maxima and the wrap masks (presumably mip width and height)
2343 tcscale[0] = texture->mipmap[mip][2];
2344 tcscale[1] = texture->mipmap[mip][3];
2345 tciwidth = texture->mipmap[mip][2];
2348 tcimax[0] = texture->mipmap[mip][2]-1;
2349 tcimax[1] = texture->mipmap[mip][3]-1;
2350 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2351 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// coordinates at the first pixel, in texels, shifted by half a texel
2352 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2353 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels: exact coordinates are evaluated at the sub-span end and stepped
// linearly in fixed point in between
2354 for (x = startx;x < endx;)
2356 unsigned int subtc[2];
2357 unsigned int substep[2];
2358 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2359 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2360 if (nextsub >= endx)
2362 nextsub = endsub = endx-1;
2363 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2367 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2368 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): substep and subtc are unsigned, but the float values assigned
// here can be negative; converting a negative float to an unsigned type is
// undefined behavior in C - confirm this is acceptable on the target compilers.
2369 substep[0] = (endtc[0] - tc[0]) * subscale;
2370 substep[1] = (endtc[1] - tc[1]) * subscale;
2371 subtc[0] = tc[0] * (1<<16);
2372 subtc[1] = tc[1] * (1<<16);
// four-tap weighted sampling, texel indices clamped to [tcimin, tcimax]
2375 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2377 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): the weights use the low 12 bits of subtc while the texel index
// is subtc >> 16; confirm which bits are meant to carry the fraction.
2379 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2380 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2381 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2382 tci[0] = subtc[0]>>16;
2383 tci[1] = subtc[1]>>16;
2384 tci1[0] = tci[0] + 1;
2385 tci1[1] = tci[1] + 1;
2386 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2387 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2388 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2389 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2390 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2391 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2392 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2393 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// the four weights sum to 0x1000*0x1000, so 255 * that sum = 0xFF000000 maps a
// fully white result to 1.0
2394 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2395 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2397 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2398 out4f[x*4+0] = c[0];
2399 out4f[x*4+1] = c[1];
2400 out4f[x*4+2] = c[2];
2401 out4f[x*4+3] = c[3];
// same four-tap sampling, but indices wrap via a bitwise AND with size-1
// (assumes power-of-two texture dimensions - TODO confirm)
2406 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2408 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2409 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2410 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2411 tci[0] = subtc[0]>>16;
2412 tci[1] = subtc[1]>>16;
2413 tci1[0] = tci[0] + 1;
2414 tci1[1] = tci[1] + 1;
2415 tci[0] &= tciwrapmask[0];
2416 tci[1] &= tciwrapmask[1];
2417 tci1[0] &= tciwrapmask[0];
2418 tci1[1] &= tciwrapmask[1];
2419 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2420 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2421 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2422 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2423 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2424 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2426 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2427 out4f[x*4+0] = c[0];
2428 out4f[x*4+1] = c[1];
2429 out4f[x*4+2] = c[2];
2430 out4f[x*4+3] = c[3];
// single-texel sampling with the index clamped to [tcimin, tcimax]
2434 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2436 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2438 tci[0] = subtc[0]>>16;
2439 tci[1] = subtc[1]>>16;
2440 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2441 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2442 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2443 c[0] = pixel[0][2] * (1.0f / 255.0f);
2444 c[1] = pixel[0][1] * (1.0f / 255.0f);
2445 c[2] = pixel[0][0] * (1.0f / 255.0f);
2446 c[3] = pixel[0][3] * (1.0f / 255.0f);
2447 out4f[x*4+0] = c[0];
2448 out4f[x*4+1] = c[1];
2449 out4f[x*4+2] = c[2];
2450 out4f[x*4+3] = c[3];
// single-texel sampling with the index wrapped by the size-1 mask
2455 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457 tci[0] = subtc[0]>>16;
2458 tci[1] = subtc[1]>>16;
2459 tci[0] &= tciwrapmask[0];
2460 tci[1] &= tciwrapmask[1];
2461 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462 c[0] = pixel[0][2] * (1.0f / 255.0f);
2463 c[1] = pixel[0][1] * (1.0f / 255.0f);
2464 c[2] = pixel[0][0] * (1.0f / 255.0f);
2465 c[3] = pixel[0][3] * (1.0f / 255.0f);
2466 out4f[x*4+0] = c[0];
2467 out4f[x*4+1] = c[1];
2468 out4f[x*4+2] = c[2];
2469 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: samples the texture bound to
// texunitindex with coordinates interpolated from attribute arrayindex (scaled
// by zf[x]) and writes one 32-bit texel-format pixel per x into out4ub.
// NOTE(review): this listing omits short lines (braces, brief declarations and
// statements such as the "if (filter)" tests); comments describe only what is
// visible.
2475 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2479 int startx = span->startx;
2480 int endx = span->endx;
2482 __m128 data, slope, tcscale;
2483 __m128i tcsize, tcmask, tcoffset, tcmax;
2485 __m128i subtc, substep, endsubtc;
2488 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2489 const unsigned char * RESTRICT pixelbase;
2490 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2491 // if no texture is bound, just fill it with white
2494 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2497 mip = triangle->mip[texunitindex];
2498 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499 // if this mipmap of the texture is 1 pixel, just fill it with that color
2500 if (texture->mipmap[mip][1] == 4)
2502 unsigned int k = *((const unsigned int *)pixelbase);
2503 for (x = startx;x < endx;x++)
2507 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2508 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2509 flags = texture->flags;
// tcsize = (m2, m3, m2, m3) taken from mipmap[mip][2] and [3] (presumably mip
// width and height); tcmask = tcsize - 1
2510 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2511 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2512 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the (s, t) pair into both halves so two pixels can be handled per register
2513 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2514 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// coordinates at the first pixel, shifted by half a texel, then converted to 16.16 fixed point
2515 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2516 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds 4 in its low half and (size0 * 4) in its
// high half, so _mm_madd_epi16 on an (x, y) pair of 16-bit indices yields the
// byte offset x*4 + y*size0*4
2517 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: the size-1 values narrowed to 16-bit lanes, used for clamping and wrapping
2518 tcmax = _mm_packs_epi32(tcmask, tcmask);
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels; coordinates are exact at sub-span ends and stepped linearly between
2519 for (x = startx;x < endx;)
2521 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2522 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2523 if (nextsub >= endx)
2525 nextsub = endsub = endx-1;
2526 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2530 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2531 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2532 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinates of pixel x; high 64 bits: coordinates of pixel x+1;
// the step is doubled because the loops below advance two pixels at a time
2533 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2534 substep = _mm_slli_epi32(substep, 1);
// fast-path test: integer coordinates at both ends of the sub-span must be
// >= 0 and < size-1, so the +1 neighbours are in range and no per-pixel
// clamp or wrap is needed
2537 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2538 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride: high half of tcoffset, the byte distance used to reach the next row
2540 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2541 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2543 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2544 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2545 tci = _mm_madd_epi16(tci, tcoffset);
2546 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2547 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; +stride fetches the row below
2548 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2549 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2550 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2551 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fracm: 16-bit lanes of subtc shifted right by one, used as mulhi weights;
// rows are blended first (lane 2 weight), then columns (lane 0 weight)
2552 fracm = _mm_srli_epi16(subtc, 1);
2553 pix1 = _mm_add_epi16(pix1,
2554 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2555 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2556 pix3 = _mm_add_epi16(pix3,
2557 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2558 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2559 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2560 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2561 pix2 = _mm_add_epi16(pix2,
2562 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2563 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2564 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// same filtering for one pixel, using only the low half of subtc
2568 const unsigned char * RESTRICT ptr1;
2569 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2570 tci = _mm_madd_epi16(tci, tcoffset);
2571 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2572 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574 fracm = _mm_srli_epi16(subtc, 1);
2575 pix1 = _mm_add_epi16(pix1,
2576 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2577 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2578 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2579 pix1 = _mm_add_epi16(pix1,
2580 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2581 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2582 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered path with clamping: the (x, y) index pair is replicated four times,
// offset by (0,0) (1,0) (0,1) (1,1), clamped to [0, tcmax], and each of the
// four texels is fetched individually
2586 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2588 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2590 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2591 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2592 tci = _mm_madd_epi16(tci, tcoffset);
2593 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2594 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2595 _mm_setzero_si128());
2596 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2597 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2598 _mm_setzero_si128());
2599 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is wrapped with a bitwise AND here,
// whereas the first pixel above (and the single-pixel case below) is clamped
// with min/max; in this clamp-to-edge branch that looks inconsistent - confirm.
2600 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2601 tci = _mm_madd_epi16(tci, tcoffset);
2602 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2603 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2604 _mm_setzero_si128());
2605 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2606 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2607 _mm_setzero_si128());
2608 fracm = _mm_srli_epi16(subtc, 1);
2609 pix1 = _mm_add_epi16(pix1,
2610 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2611 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2612 pix3 = _mm_add_epi16(pix3,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2614 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2615 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2616 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2617 pix2 = _mm_add_epi16(pix2,
2618 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2619 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2620 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// clamped filtering for one pixel
2624 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2625 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2626 tci = _mm_madd_epi16(tci, tcoffset);
2627 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2628 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2629 _mm_setzero_si128());
2630 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2632 _mm_setzero_si128());
2633 fracm = _mm_srli_epi16(subtc, 1);
2634 pix1 = _mm_add_epi16(pix1,
2635 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2636 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2637 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2638 pix1 = _mm_add_epi16(pix1,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2641 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// filtered path with wrapping: same four-tap fetch, indices masked with tcmax
// (assumes power-of-two texture dimensions - TODO confirm)
2647 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2649 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2650 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2651 tci = _mm_madd_epi16(tci, tcoffset);
2652 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2653 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2654 _mm_setzero_si128());
2655 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2656 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2657 _mm_setzero_si128());
2658 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2659 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2660 tci = _mm_madd_epi16(tci, tcoffset);
2661 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2662 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2663 _mm_setzero_si128());
2664 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2665 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2666 _mm_setzero_si128());
2667 fracm = _mm_srli_epi16(subtc, 1);
2668 pix1 = _mm_add_epi16(pix1,
2669 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2671 pix3 = _mm_add_epi16(pix3,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2673 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2674 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2675 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2676 pix2 = _mm_add_epi16(pix2,
2677 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2678 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2679 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// wrapped filtering for one pixel
2683 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2684 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2685 tci = _mm_madd_epi16(tci, tcoffset);
2686 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2687 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2688 _mm_setzero_si128());
2689 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2690 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2691 _mm_setzero_si128());
2692 fracm = _mm_srli_epi16(subtc, 1);
2693 pix1 = _mm_add_epi16(pix1,
2694 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2695 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2696 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2697 pix1 = _mm_add_epi16(pix1,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2700 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// unfiltered path with clamping: one texel per pixel, index clamped to [0, tcmax]
2707 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2709 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2711 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2713 tci = _mm_madd_epi16(tci, tcoffset);
2714 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2719 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// unfiltered path with wrapping: one texel per pixel, index masked with tcmax
2728 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2730 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2731 tci = _mm_and_si128(tci, tcmax);
2732 tci = _mm_madd_epi16(tci, tcoffset);
2733 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2734 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2738 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2739 tci = _mm_and_si128(tci, tcmax);
2740 tci = _mm_madd_epi16(tci, tcoffset);
2741 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2750 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2753 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadowmap sampling entry point: takes a pointer to float components and
// returns a float.
// NOTE(review): the body of this function is not visible in this listing;
// presumably it is a stub - confirm before relying on its result.
2756 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA span by an interpolated vertex attribute.
// For every x in [span->startx, span->endx) the attribute value is
// (data + slope*x) * z, with data/slope set up by DPSOFTRAST_CALCATTRIB4F for
// arrayindex; each of the four channels of in4f is multiplied by it and the
// product is stored in out4f (4 floats per pixel).
// NOTE(review): the local declarations and the assignment of z are on lines
// not shown in this listing; z presumably comes from zf[x] - confirm.
2762 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2765 int startx = span->startx;
2766 int endx = span->endx;
2771 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2772 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2775 c[0] = (data[0] + slope[0]*x) * z;
2776 c[1] = (data[1] + slope[1]*x) * z;
2777 c[2] = (data[2] + slope[2]*x) * z;
2778 c[3] = (data[3] + slope[3]*x) * z;
// modulate the incoming color channel by channel
2779 out4f[x*4+0] = in4f[x*4+0] * c[0];
2780 out4f[x*4+1] = in4f[x*4+1] * c[1];
2781 out4f[x*4+2] = in4f[x*4+2] * c[2];
2782 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute as a float RGBA span.
// For every x in [span->startx, span->endx) the four components are
// (data + slope*x) * z, with data/slope set up by DPSOFTRAST_CALCATTRIB4F for
// arrayindex, and are stored in out4f (4 floats per pixel).
// NOTE(review): the local declarations and the assignment of z are on lines
// not shown in this listing; z presumably comes from zf[x] - confirm.
2786 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2789 int startx = span->startx;
2790 int endx = span->endx;
2795 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796 for (x = startx;x < endx;x++)
// interpolated attribute at this pixel
2799 c[0] = (data[0] + slope[0]*x) * z;
2800 c[1] = (data[1] + slope[1]*x) * z;
2801 c[2] = (data[2] + slope[2]*x) * z;
2802 c[3] = (data[3] + slope[3]*x) * z;
2803 out4f[x*4+0] = c[0];
2804 out4f[x*4+1] = c[1];
2805 out4f[x*4+2] = c[2];
2806 out4f[x*4+3] = c[3];
2810 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2812 int x, startx = span->startx, endx = span->endx;
2813 float c[4], localcolor[4];
2814 localcolor[0] = subcolor[0];
2815 localcolor[1] = subcolor[1];
2816 localcolor[2] = subcolor[2];
2817 localcolor[3] = subcolor[3];
2818 for (x = startx;x < endx;x++)
2820 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2821 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2822 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2823 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2824 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2825 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2826 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2827 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2831 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2833 int x, startx = span->startx, endx = span->endx;
2834 for (x = startx;x < endx;x++)
2836 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2837 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2838 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2839 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2843 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2845 int x, startx = span->startx, endx = span->endx;
2846 for (x = startx;x < endx;x++)
2848 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2849 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2850 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2851 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float RGBA spans per channel: out = ina*a + inb*b for every
// pixel x in [span->startx, span->endx), where a = 1 - (alpha of inb).
// NOTE(review): the declaration of a/b and the assignment of b are on lines
// not shown in this listing; b is presumably the alpha of inb - confirm.
2855 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 int x, startx = span->startx, endx = span->endx;
2859 for (x = startx;x < endx;x++)
// weight of the first buffer is whatever the second buffer's alpha leaves over
2861 a = 1.0f - inb4f[x*4+3];
2863 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2864 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2865 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2866 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2870 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2872 int x, startx = span->startx, endx = span->endx;
2873 float localcolor[4], ilerp, lerp;
2874 localcolor[0] = color[0];
2875 localcolor[1] = color[1];
2876 localcolor[2] = color[2];
2877 localcolor[3] = color[3];
2878 ilerp = 1.0f - localcolor[3];
2879 lerp = localcolor[3];
2880 for (x = startx;x < endx;x++)
2882 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2883 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2884 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2885 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2891 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2895 int startx = span->startx;
2896 int endx = span->endx;
2899 __m128i submod, substep, endsubmod;
2900 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2901 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2902 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2903 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2904 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2905 for (x = startx; x < endx;)
2907 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2908 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2909 if (nextsub >= endx)
2911 nextsub = endsub = endx-1;
2912 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2916 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2917 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2918 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2919 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2920 substep = _mm_packs_epi32(substep, substep);
2921 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2923 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2924 pix = _mm_mulhi_epu16(pix, submod);
2925 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2929 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2930 pix = _mm_mulhi_epu16(pix, submod);
2931 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2938 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2942 int startx = span->startx;
2943 int endx = span->endx;
2946 __m128i submod, substep, endsubmod;
2947 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2948 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2949 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2950 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2951 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2952 for (x = startx; x < endx;)
2954 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2955 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2956 if (nextsub >= endx)
2958 nextsub = endsub = endx-1;
2959 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2963 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2964 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2965 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2966 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2967 substep = _mm_packs_epi32(substep, substep);
2968 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2970 __m128i pix = _mm_srai_epi16(submod, 4);
2971 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2975 __m128i pix = _mm_srai_epi16(submod, 4);
2976 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2983 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2986 int x, startx = span->startx, endx = span->endx;
2987 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2988 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2989 for (x = startx;x+2 <= endx;x+=2)
2991 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2992 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2993 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2994 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2998 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2999 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3000 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3001 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3006 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3009 int x, startx = span->startx, endx = span->endx;
3010 for (x = startx;x+2 <= endx;x+=2)
3012 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3014 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3021 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3027 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3030 int x, startx = span->startx, endx = span->endx;
3031 for (x = startx;x+2 <= endx;x+=2)
3033 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3035 pix1 = _mm_add_epi16(pix1, pix2);
3036 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3042 pix1 = _mm_add_epi16(pix1, pix2);
3043 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3048 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3051 int x, startx = span->startx, endx = span->endx;
3052 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3053 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3054 for (x = startx;x+2 <= endx;x+=2)
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3058 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3065 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3071 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3074 int x, startx = span->startx, endx = span->endx;
3075 for (x = startx;x+2 <= endx;x+=2)
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3079 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3080 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3081 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3085 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3086 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3087 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3088 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3089 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3094 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3097 int x, startx = span->startx, endx = span->endx;
3098 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3099 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3100 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3101 for (x = startx;x+2 <= endx;x+=2)
3103 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3104 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3105 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3109 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3110 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3111 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3118 void DPSOFTRAST_VertexShader_Generic(void)
3120 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3121 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3123 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3124 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3127 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3129 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3130 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3134 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3136 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3137 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3138 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3140 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3141 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3144 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3146 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3149 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3151 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3154 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3159 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3160 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3165 void DPSOFTRAST_VertexShader_PostProcess(void)
3167 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3168 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3169 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3172 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3174 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3175 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3176 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3179 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3180 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3182 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3183 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3185 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3186 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3188 // TODO: implement saturation
3190 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3192 // TODO: implement gammaramps
3194 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3199 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3201 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3204 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 // this is never called (because colormask is off when this shader is used)
3207 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3208 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3210 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3216 void DPSOFTRAST_VertexShader_FlatColor(void)
3218 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3222 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3225 unsigned char * RESTRICT pixelmask = span->pixelmask;
3226 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3227 int x, startx = span->startx, endx = span->endx;
3228 __m128i Color_Ambientm;
3229 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3230 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3231 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3234 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3235 pixel = buffer_FragColorbgra8;
3236 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3237 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3238 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3239 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3240 for (x = startx;x < endx;x++)
3243 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3246 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3247 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3248 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3249 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3255 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3256 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3257 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3259 if (pixel == buffer_FragColorbgra8)
3260 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3266 void DPSOFTRAST_VertexShader_VertexColor(void)
3268 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3269 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3270 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3273 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 unsigned char * RESTRICT pixelmask = span->pixelmask;
3277 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278 int x, startx = span->startx, endx = span->endx;
3279 __m128i Color_Ambientm, Color_Diffusem;
3281 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3282 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3285 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3287 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3288 pixel = buffer_FragColorbgra8;
3289 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3290 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3291 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3292 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3293 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3294 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3295 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3296 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3297 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3298 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3299 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3300 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3301 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3302 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3304 __m128i color, mod, pix;
3305 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3308 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3309 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3310 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3311 data = _mm_add_ps(data, slope);
3312 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3313 data = _mm_add_ps(data, slope);
3314 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3315 data = _mm_add_ps(data, slope);
3316 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3317 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3318 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3319 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3320 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3321 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3327 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3328 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3329 mod = _mm_packs_epi32(mod, mod);
3330 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3331 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3333 if (pixel == buffer_FragColorbgra8)
3334 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3340 void DPSOFTRAST_VertexShader_Lightmap(void)
3342 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3343 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3344 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3347 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3350 unsigned char * RESTRICT pixelmask = span->pixelmask;
3351 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3352 int x, startx = span->startx, endx = span->endx;
3353 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3354 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3360 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3361 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3362 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363 pixel = buffer_FragColorbgra8;
3364 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3368 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3371 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3373 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3374 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3375 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3376 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3377 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3378 for (x = startx;x < endx;x++)
3380 __m128i color, lightmap, glow, pix;
3381 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3384 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3385 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3386 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3387 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3388 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3389 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3390 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3391 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3392 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3393 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3399 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3400 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3401 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3402 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3403 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3404 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3409 for (x = startx;x < endx;x++)
3411 __m128i color, lightmap, pix;
3412 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3415 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3416 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3417 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3418 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3419 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3420 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3421 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3427 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3428 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3429 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3430 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3433 if (pixel == buffer_FragColorbgra8)
3434 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3440 void DPSOFTRAST_VertexShader_FakeLight(void)
3442 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3445 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3448 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3449 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3450 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3451 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3452 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light-direction maps.
// Currently identical to the plain lightmap vertex stage, to which it delegates.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3462 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3464 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light-direction maps.
// Currently identical to the plain lightmap vertex stage, to which it delegates.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3475 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3477 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3483 void DPSOFTRAST_VertexShader_LightDirection(void)
3486 int numvertices = dpsoftrast.numvertices;
3488 float LightVector[4];
3489 float EyePosition[4];
3490 float EyeVectorModelSpace[4];
3496 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3497 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3498 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3499 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3500 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3501 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3502 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3503 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3504 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3505 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3506 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3507 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3508 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3509 for (i = 0;i < numvertices;i++)
3511 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3512 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3513 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3514 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3515 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3516 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3517 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3518 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3519 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3520 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3521 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3522 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3523 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3524 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3525 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3526 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3528 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3529 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3530 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3531 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3532 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3533 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3534 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3535 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3536 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3538 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3539 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3541 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar min/max.  Both macros evaluate their arguments more than once, so
// never pass expressions with side effects (i++, function calls, ...).
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable operands.
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length / length of the first three elements of v.
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v in place so the vector has unit length.
// A zero-length vector is left untouched (no division by zero).
// v is parenthesized at every use so that expression arguments expand
// correctly, and the temporary has a long prefixed name so it cannot shadow a
// caller variable that happens to be called "len".
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len)\
	{\
		dpsoftrast_normalize_len = 1.0f / dpsoftrast_normalize_len;\
		(v)[0] *= dpsoftrast_normalize_len;\
		(v)[1] *= dpsoftrast_normalize_len;\
		(v)[2] *= dpsoftrast_normalize_len;\
	}\
}\
while(0)
3563 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3565 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3566 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3567 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573 int x, startx = span->startx, endx = span->endx;
3574 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3575 float LightVectordata[4];
3576 float LightVectorslope[4];
3577 float EyeVectordata[4];
3578 float EyeVectorslope[4];
3580 float diffusetex[4];
3582 float surfacenormal[4];
3583 float lightnormal[4];
3585 float specularnormal[4];
3588 float SpecularPower;
3590 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3591 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3592 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3593 Color_Glow[3] = 0.0f;
3594 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3595 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3596 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3597 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3598 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3599 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3600 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3601 Color_Pants[3] = 0.0f;
3602 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3603 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3604 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3605 Color_Shirt[3] = 0.0f;
3606 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3607 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3608 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3610 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3613 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3615 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3617 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3619 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3620 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3621 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3622 Color_Diffuse[3] = 0.0f;
3623 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3624 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3625 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3626 LightColor[3] = 0.0f;
3627 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3628 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3629 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3630 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3631 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3632 Color_Specular[3] = 0.0f;
3633 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3634 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3635 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3636 for (x = startx;x < endx;x++)
3639 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3640 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3641 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3642 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3643 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3645 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3646 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3647 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3648 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3650 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3651 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3652 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3653 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3654 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3655 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3656 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3657 DPSOFTRAST_Vector3Normalize(surfacenormal);
3659 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3660 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3661 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3662 DPSOFTRAST_Vector3Normalize(lightnormal);
3664 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3665 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3666 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3667 DPSOFTRAST_Vector3Normalize(eyenormal);
3669 specularnormal[0] = lightnormal[0] + eyenormal[0];
3670 specularnormal[1] = lightnormal[1] + eyenormal[1];
3671 specularnormal[2] = lightnormal[2] + eyenormal[2];
3672 DPSOFTRAST_Vector3Normalize(specularnormal);
3674 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3675 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3676 specular = pow(specular, SpecularPower * glosstex[3]);
3677 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3679 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3680 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3681 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3682 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3686 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3687 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3688 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3689 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3691 buffer_FragColorbgra8[x*4+0] = d[0];
3692 buffer_FragColorbgra8[x*4+1] = d[1];
3693 buffer_FragColorbgra8[x*4+2] = d[2];
3694 buffer_FragColorbgra8[x*4+3] = d[3];
3697 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3699 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3700 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3701 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3702 Color_Diffuse[3] = 0.0f;
3703 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3704 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3705 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3706 LightColor[3] = 0.0f;
3707 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3708 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709 for (x = startx;x < endx;x++)
3712 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3713 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3714 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3715 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3716 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3717 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3718 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3719 DPSOFTRAST_Vector3Normalize(surfacenormal);
3721 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3722 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3723 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3724 DPSOFTRAST_Vector3Normalize(lightnormal);
3726 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3727 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3729 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3730 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3731 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3732 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3736 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3737 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3738 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3739 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3741 buffer_FragColorbgra8[x*4+0] = d[0];
3742 buffer_FragColorbgra8[x*4+1] = d[1];
3743 buffer_FragColorbgra8[x*4+2] = d[2];
3744 buffer_FragColorbgra8[x*4+3] = d[3];
3749 for (x = startx;x < endx;x++)
3752 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3753 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3754 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3755 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3757 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3759 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3760 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3761 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3762 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3766 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3767 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3768 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3769 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3771 buffer_FragColorbgra8[x*4+0] = d[0];
3772 buffer_FragColorbgra8[x*4+1] = d[1];
3773 buffer_FragColorbgra8[x*4+2] = d[2];
3774 buffer_FragColorbgra8[x*4+3] = d[3];
3777 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3782 void DPSOFTRAST_VertexShader_LightSource(void)
3785 int numvertices = dpsoftrast.numvertices;
3786 float LightPosition[4];
3787 float LightVector[4];
3788 float LightVectorModelSpace[4];
3789 float EyePosition[4];
3790 float EyeVectorModelSpace[4];
3796 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3797 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3798 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3799 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3800 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3801 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3802 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3803 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3804 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3805 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3806 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3807 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3808 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3809 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3810 for (i = 0;i < numvertices;i++)
3812 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3813 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3814 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3815 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3816 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3817 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3818 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3819 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3820 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3821 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3822 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3823 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3824 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3825 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3826 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3827 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3828 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3829 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
3830 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3831 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3832 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3833 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3834 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3835 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3836 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3837 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3838 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3839 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3840 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3842 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3843 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3845 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3846 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3849 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3852 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3853 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3854 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3859 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3860 int x, startx = span->startx, endx = span->endx;
3861 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3862 float CubeVectordata[4];
3863 float CubeVectorslope[4];
3864 float LightVectordata[4];
3865 float LightVectorslope[4];
3866 float EyeVectordata[4];
3867 float EyeVectorslope[4];
3869 float diffusetex[4];
3871 float surfacenormal[4];
3872 float lightnormal[4];
3874 float specularnormal[4];
3877 float SpecularPower;
3878 float CubeVector[4];
3881 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3882 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3883 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3884 Color_Glow[3] = 0.0f;
3885 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3886 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3887 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3888 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3889 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3890 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3891 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3892 Color_Diffuse[3] = 0.0f;
3893 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3894 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3895 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3896 Color_Specular[3] = 0.0f;
3897 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3898 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3899 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3900 Color_Pants[3] = 0.0f;
3901 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3902 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3903 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3904 Color_Shirt[3] = 0.0f;
3905 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3906 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3907 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3908 LightColor[3] = 0.0f;
3909 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3910 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3911 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3912 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3913 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3914 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3915 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3916 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3918 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3919 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3921 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3922 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3923 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3925 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927 for (x = startx;x < endx;x++)
3930 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3931 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3932 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3933 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3934 if (attenuation < 0.01f)
3936 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3938 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3939 if (attenuation < 0.01f)
3943 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3944 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3945 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3946 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3947 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3949 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3950 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3951 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3952 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3954 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3955 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3956 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3957 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3958 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3959 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3960 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3961 DPSOFTRAST_Vector3Normalize(surfacenormal);
3963 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3964 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3965 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3966 DPSOFTRAST_Vector3Normalize(lightnormal);
3968 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3969 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3970 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3971 DPSOFTRAST_Vector3Normalize(eyenormal);
3973 specularnormal[0] = lightnormal[0] + eyenormal[0];
3974 specularnormal[1] = lightnormal[1] + eyenormal[1];
3975 specularnormal[2] = lightnormal[2] + eyenormal[2];
3976 DPSOFTRAST_Vector3Normalize(specularnormal);
3978 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3979 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3980 specular = pow(specular, SpecularPower * glosstex[3]);
3981 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3983 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3984 attenuation *= (1.0f / 255.0f);
3985 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3986 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3987 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3988 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3992 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3993 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3994 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3995 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3997 buffer_FragColorbgra8[x*4+0] = d[0];
3998 buffer_FragColorbgra8[x*4+1] = d[1];
3999 buffer_FragColorbgra8[x*4+2] = d[2];
4000 buffer_FragColorbgra8[x*4+3] = d[3];
4003 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4005 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4006 for (x = startx;x < endx;x++)
4009 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4010 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4011 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4012 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4013 if (attenuation < 0.01f)
4015 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4017 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4018 if (attenuation < 0.01f)
4022 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4023 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4024 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4025 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4026 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4028 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4029 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4030 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4031 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4033 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4034 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4035 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4036 DPSOFTRAST_Vector3Normalize(surfacenormal);
4038 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4039 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4040 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4041 DPSOFTRAST_Vector3Normalize(lightnormal);
4043 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4044 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4046 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4047 attenuation *= (1.0f / 255.0f);
4048 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4049 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4050 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4051 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4055 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4056 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4057 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4058 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4060 buffer_FragColorbgra8[x*4+0] = d[0];
4061 buffer_FragColorbgra8[x*4+1] = d[1];
4062 buffer_FragColorbgra8[x*4+2] = d[2];
4063 buffer_FragColorbgra8[x*4+3] = d[3];
4068 for (x = startx;x < endx;x++)
4071 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4072 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4073 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4074 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4075 if (attenuation < 0.01f)
4077 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4079 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4080 if (attenuation < 0.01f)
4084 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4085 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4086 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4087 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4088 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4090 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4091 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4092 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4093 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4095 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4097 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4098 attenuation *= (1.0f / 255.0f);
4099 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4100 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4101 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4102 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4106 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4107 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4108 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4109 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4111 buffer_FragColorbgra8[x*4+0] = d[0];
4112 buffer_FragColorbgra8[x*4+1] = d[1];
4113 buffer_FragColorbgra8[x*4+2] = d[2];
4114 buffer_FragColorbgra8[x*4+3] = d[3];
4117 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4123 void DPSOFTRAST_VertexShader_Refraction(void)
4125 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4128 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4131 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4132 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4133 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4134 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4135 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4140 void DPSOFTRAST_VertexShader_Water(void)
4142 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4146 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4149 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4150 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4152 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4153 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4158 void DPSOFTRAST_VertexShader_ShowDepth(void)
4160 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4163 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4166 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4167 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4168 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4169 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4170 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4175 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4177 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4180 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4183 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4184 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4185 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4186 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4187 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4192 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4194 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4197 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4200 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4201 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4203 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4204 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode for the rasterizer: its vertex and span entry points plus the
// vertex arrays and texture units the mode uses.
// NOTE(review): the table initializers start with an integer and the draw interpreter reads a
// lodarrayindex member; that leading member is not visible in this listing - confirm against the full file.
4209 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point; takes no arguments
4212 void (*Vertex)(void);
// span (pixel) stage entry point, called once per span with the owning thread and triangle
4213 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices used by the mode; readers compare each entry against
// DPSOFTRAST_ARRAY_TOTAL, and the table writes ~0 as the out-of-range end marker
4214 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of texture unit indices used by the mode; readers compare each entry against
// DPSOFTRAST_MAXTEXTUREUNITS, and the table writes ~0 as the out-of-range end marker
4215 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4217 DPSOFTRAST_ShaderModeInfo;
4219 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4221 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4222 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4223 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4224 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4225 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4226 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4227 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4228 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4229 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4230 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4231 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4232 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4233 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4234 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4235 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4236 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Processes every span queued on this thread (thread->spans[0 .. numspans-1]) and then resets
// thread->numspans to 0. For each span it either builds a per-pixel depth-test mask and runs the
// current shader mode's Span function on the surviving range, or (without depth testing) marks
// all pixels visible and runs the Span function directly.
// NOTE(review): this listing omits short lines (braces, else, some declarations and loop bodies);
// the comments below describe only what the visible lines establish.
4239 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4246 // unsigned int *colorpixel;
4247 unsigned int *depthpixel;
4253 DPSOFTRAST_State_Triangle *triangle;
4254 DPSOFTRAST_State_Span *span;
// one byte per pixel of the current span; nonzero means the pixel passed the depth test
4255 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4256 for (i = 0; i < thread->numspans; i++)
4258 span = &thread->spans[i];
// span->triangle is an index into the thread's triangle queue
4259 triangle = &thread->triangles[span->triangle];
// depth-tested path: requires depth testing to be enabled and a depth buffer to exist
4260 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w[] is a plane equation for w: w[0] is the per-x slope, w[1] the per-y slope, w[2] the origin value
4262 wslope = triangle->w[0];
// w at the span's origin pixel (span->x, span->y)
4263 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth step per pixel in x
4264 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// integer depth at the span origin, biased by the polygon offset (constant term plus slope-scaled term)
4265 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depth buffer address of the span origin; rows are fb_width entries apart
4266 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4267 startx = span->startx;
// NOTE(review): endx is used below but its assignment is not visible in this listing
// each case walks the span with an incrementally interpolated depth d and records, per pixel,
// whether the stored depth compares against d as the depth function requires
4269 switch(thread->fb_depthfunc)
4272 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4273 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4274 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4275 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4276 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4277 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4278 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4280 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4281 //for (x = startx;x < endx;x++)
4282 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4283 // if there is no color buffer, skip pixel shader
// scan for the first and last pixels that passed the depth test
// (the statements that move startx/endx are not visible in this listing)
4284 while (startx < endx && !pixelmask[startx])
4286 while (endx > startx && !pixelmask[endx-1])
4289 continue; // no pixels to fill
// publish the mask and the range start to the shader through the span itself
4290 span->pixelmask = pixelmask;
4291 span->startx = startx;
4293 // run pixel shader if appropriate
4294 // do this before running depthmask code, to allow the pixelshader
4295 // to clear pixelmask values for alpha testing
4296 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4297 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled, walk the range again with the same interpolated depth
// (the loop body is not visible in this listing)
4298 if (thread->depthmask)
4299 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4305 // no depth testing means we're just dealing with color...
4306 // if there is no color buffer, skip pixel shader
4307 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span counts as visible
4309 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4310 span->pixelmask = pixelmask;
4311 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue is now empty
4315 thread->numspans = 0;
// Declares the Draw command (number 22) and its payload fields through the DEFCOMMAND macro.
// NOTE(review): presumably this is what defines DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw,
// both of which are used by the draw functions in this file - confirm against the macro definition.
// refcount is an ATOMIC_COUNTER; the draw interpreter decrements it atomically when a thread finishes.
4318 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on a rasterizer thread.
// For every triangle of the command it: clips against the near plane and projects the resulting
// points, computes a screen-space bounding box clamped to the scissor rectangle, sets up
// interpolation planes for w and for each vertex array the current shader mode lists, selects a
// mip level per texture unit, and walks the scanlines emitting spans into thread->spans.
// The span and triangle queues are flushed through DPSOFTRAST_Draw_ProcessSpans when they fill
// and once more at the end.
// The command's refcount is decremented atomically when this thread is done with it; when it
// reaches zero and commandsize shows the vertex data was not stored inside the command,
// command->arrays is released with MM_FREE.
//   thread  - state of the thread executing the command
//   command - the Draw command to execute
// NOTE(review): this listing omits short lines (braces, else, break/return/continue, some
// declarations and macro invocations); comments describe only what the visible lines establish.
4320 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4323 int cullface = thread->cullface;
4324 int minx, maxx, miny, maxy;
4325 int miny1, maxy1, miny2, maxy2;
4326 __m128i fbmin, fbmax;
4327 __m128 viewportcenter, viewportscale;
4328 int firstvertex = command->firstvertex;
4329 int numvertices = command->numvertices;
4330 int numtriangles = command->numtriangles;
4331 const int *element3i = command->element3i;
4332 const unsigned short *element3s = command->element3s;
4333 int clipped = command->clipped;
4340 int starty, endy, bandy;
4344 __m128 triangleedge1, triangleedge2, trianglenormal;
4347 DPSOFTRAST_State_Triangle *triangle;
4348 DPSOFTRAST_Texture *texture;
4349 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical limits: the scissor rectangle, then this thread's two row ranges
// [miny1,maxy1) and [miny2,maxy2) clamped into it
4350 miny = thread->fb_scissor[1];
4351 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4352 miny1 = bound(miny, thread->miny1, maxy);
4353 maxy1 = bound(miny, thread->maxy1, maxy);
4354 miny2 = bound(miny, thread->miny2, maxy);
4355 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither of this thread's ranges: just drop our reference
4356 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4358 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare (aligned) header keeps its vertex data in a separate allocation
4360 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4361 MM_FREE(command->arrays);
4365 minx = thread->fb_scissor[0];
4366 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// 16-bit (x,y) clamp limits repeated across the register; fbmax is inclusive (limit minus 1)
4367 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4368 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4369 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4370 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4371 screen[3] = _mm_setzero_ps();
4372 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4373 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen-space float4 values; the float4 arrays follow
4375 const float *screencoord4f = command->arrays;
4376 const float *arrays = screencoord4f + numvertices*4;
4378 // generate the 3 edges of this triangle
4379 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the triangle's three vertex indices, made relative to firstvertex, from whichever
// element list applies (the selecting conditions are not visible in this listing)
4382 e[0] = element3s[i*3+0] - firstvertex;
4383 e[1] = element3s[i*3+1] - firstvertex;
4384 e[2] = element3s[i*3+2] - firstvertex;
4388 e[0] = element3i[i*3+0] - firstvertex;
4389 e[1] = element3i[i*3+1] - firstvertex;
4390 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//   SKIPBACKFACE       - builds two edge vectors from screen[0..2] and a scalar cross product, then
//                        tests its sign (the actions taken on each sign are not visible here)
//   CLIPPEDVERTEXLERP  - stores the clip fraction for edge p1->p2 in clipfrac[p1] and projects the
//                        interpolated vertex into screen[k]
//   CLIPPEDVERTEXCOPY  - loads an unclipped vertex's screen coordinate into screen[k]
//   GENATTRIBCOPY/LERP - the same copy / interpolate steps for an attribute array, reusing clipfrac[p1]
//   GENATTRIBS         - picks the copy/lerp combination per case number for the three attributes
4399 #define SKIPBACKFACE \
4400 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4401 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4402 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4403 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4404 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4408 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4412 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4417 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4418 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4420 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4421 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4423 #define CLIPPEDVERTEXCOPY(k,p1) \
4424 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4426 #define GENATTRIBCOPY(attrib, p1) \
4427 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4428 #define GENATTRIBLERP(attrib, p1, p2) \
4430 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4431 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4433 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4437 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4438 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4439 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4440 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4441 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4442 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4443 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4449 // calculate distance from nearplane
// z + w of each vertex; a negative value means the vertex is behind the near plane
4450 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4451 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4452 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over which of the three vertices are in front of the near plane; each leaf
// copies the surviving vertices and interpolates new ones on the crossing edges (3 or 4 points)
4453 if (clipdist[0] >= 0.0f)
4455 if (clipdist[1] >= 0.0f)
4457 if (clipdist[2] >= 0.0f)
4460 // triangle is entirely in front of nearplane
4461 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4468 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4476 if (clipdist[2] >= 0.0f)
4478 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4485 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4492 else if (clipdist[1] >= 0.0f)
4494 if (clipdist[2] >= 0.0f)
4496 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4503 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4509 else if (clipdist[2] >= 0.0f)
4511 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4516 else continue; // triangle is entirely behind nearplane
4519 // calculate integer y coords for triangle points
// truncate the points' x,y to integers and pack them to 16 bits (a 3-point polygon repeats
// screen[2] in the fourth slot), then reduce to a min/max bounding box
4520 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4521 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4522 screenmin = _mm_min_epi16(screeni, screenir),
4523 screenmax = _mm_max_epi16(screeni, screenir);
4524 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4525 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to the scissor / row-range limits
4526 screenmin = _mm_max_epi16(screenmin, fbmin);
4527 screenmax = _mm_min_epi16(screenmax, fbmax);
4528 // skip offscreen triangles
4529 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit element 1 is the y component; endy is exclusive
4531 starty = _mm_extract_epi16(screenmin, 1);
4532 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two row ranges
4533 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift leaves the sign-extended y of each point in its own 32-bit lane
4535 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4538 triangle = &thread->triangles[thread->numtriangles];
4540 // calculate attribute plans for triangle data...
4541 // okay, this triangle is going to produce spans, we'd better project
4542 // the interpolants now (this is what gives perspective texturing),
4543 // this consists of simply multiplying all arrays by the W coord
4544 // (which is basically 1/Z), which will be undone per-pixel
4545 // (multiplying by Z again) to get the perspective-correct array
4548 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4549 __m128 mipedgescale, mipdensity;
// edge vectors divided by the scalar cross product; the four lanes are then broadcast
// individually for use as plane-equation coefficients
4550 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4551 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4552 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4553 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4554 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each of the first three screen points, broadcast to all lanes
4555 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4556 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4557 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w relative to point 1: x slope, y slope and value at the screen origin
4558 attribedge1 = _mm_sub_ss(w0, w1);
4559 attribedge2 = _mm_sub_ss(w2, w1);
4560 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4561 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4562 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4563 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4564 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4565 _mm_store_ss(&triangle->w[0], attribxslope);
4566 _mm_store_ss(&triangle->w[1], attribyslope);
4567 _mm_store_ss(&triangle->w[2], attriborigin);
4568 mipedgescale = _mm_setzero_ps();
// the same plane setup for every vertex array the shader mode lists, with each attribute
// pre-multiplied by its vertex's w
4569 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4571 __m128 attrib0, attrib1, attrib2;
4572 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range entry marks the end of the list
4573 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next float4 array in the command data
4575 arrays += numvertices*4;
4576 GENATTRIBS(attrib0, attrib1, attrib2);
4577 attriborigin = _mm_mul_ps(attrib1, w1);
4578 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4579 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4580 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4581 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4582 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// attribs[k] holds x slope, y slope and origin value, in that order
4583 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4584 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4585 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// for the array the shader mode designates for level-of-detail: attribute change along each
// of the two edges scaled by the (approximate) reciprocal screen-space edge length
4586 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4588 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4589 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4590 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4591 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip levels default to 0 and are raised per texture unit below
4595 memset(triangle->mip, 0, sizeof(triangle->mip));
4596 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4598 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// an out-of-range entry marks the end of the list
4599 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4601 texture = thread->texbound[texunit];
// only textures whose filter ranks above plain LINEAR get a mip level computed
4602 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale the edge rates by two integers read from texture->mipmap[0][2..3], square and sum per
// edge, and keep the smaller of the two edge densities
4604 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4605 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4606 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4607 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4608 // this will be multiplied in the texturing routine by the texture resolution
4609 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, truncated, then clamped to the last mip level
4612 y = (int)(log((float)y)*0.5f/M_LN2);
4613 if (y > texture->mipmaps - 1)
4614 y = texture->mipmaps - 1;
4615 triangle->mip[texunit] = y;
// scanline walk: the first pass is limited to this thread's first row range (bandy up to maxy1);
// the loop step then moves y up to at least miny2 and limits bandy to maxy2 for the second range
4621 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4624 __m128 xcoords, xslope;
// one 32-bit lane per point, all ones where y is greater than that point's integer y;
// movemask turns each lane into a hex nibble (F or 0) of yccmask
4625 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4626 int yccmask = _mm_movemask_epi8(ycc);
// edgeNp / edgeNn: indices of the starting and ending point of the two active polygon edges
4627 int edge0p, edge0n, edge1p, edge1n;
// edge selection for a 4-point polygon (the enclosing switch header is not visible in this listing)
4634 case 0xFFFF: /*0000*/ y = endy; continue;
4635 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4636 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4637 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4638 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4639 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4640 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4641 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4642 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4643 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4644 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4645 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4646 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4647 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4648 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4649 case 0x0000: /*1111*/ y++; continue;
// edge selection for a 3-point polygon; the fourth lane repeats point 2, so only these masks occur
4657 case 0xFFFF: /*000*/ y = endy; continue;
4658 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4659 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4660 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4661 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4662 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4663 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4664 case 0x0000: /*111*/ y++; continue;
// nexty: the last scanline that can be drawn with this edge pair - the smallest value across the
// lanes of max(mask >> 1, point y) - limited to the current row range
4667 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4668 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4669 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4670 nexty = _mm_extract_epi16(ycc, 0);
4671 if (nexty >= bandy) nexty = bandy-1;
// per-edge delta divided by its own y delta gives dx/dy; xcoords is each edge's x at scanline y
4672 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4673 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4674 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4675 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
// bias by half a pixel before the float-to-int conversions below
4676 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// make sure the edge with the smaller x comes first
4677 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4679 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4680 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// one iteration per scanline, stepping both edge x positions by their slopes
4682 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4684 int startx, endx, offset;
4685 startx = _mm_cvtss_si32(xcoords);
4686 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4689 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of the maximum span length
// (the mask arithmetic assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two - TODO confirm)
4690 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4692 if (endx > maxx) endx = maxx;
4693 if (startx >= endx) continue;
// cut the scanline into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
4694 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4696 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4697 span->triangle = thread->numtriangles;
// startx/endx are relative to the chunk origin and clipped to the scissor's left edge and the chunk length
4700 span->startx = max(minx - offset, 0);
4701 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4702 if (span->startx >= span->endx)
// span queue full: shade what has been queued so far
4704 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4705 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the pending spans and restart the queue
4710 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4712 DPSOFTRAST_Draw_ProcessSpans(thread);
4713 thread->numtriangles = 0;
// done with the command: drop our reference, and free the separately allocated vertex data
// if this was the last one
4717 if (!ATOMIC_DECREMENT(command->refcount))
4719 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4720 MM_FREE(command->arrays);
// shade whatever is still queued
4723 if (thread->numspans > 0 || thread->numtriangles > 0)
4725 DPSOFTRAST_Draw_ProcessSpans(thread);
4726 thread->numtriangles = 0;
// Allocates a Draw command together with the storage it needs: screen coordinates, transformed
// positions, one float4 array per vertex array the current shader mode lists, and a private copy
// of the element indices. It also points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[]
// at that storage.
//   firstvertex, numvertices - vertex range of the draw, stored in the command and in dpsoftrast
//   numtriangles             - number of triangles; three indices are copied per triangle
//   element3i / element3s    - 32-bit / 16-bit index lists; the condition choosing which one is
//                              copied is not visible in this listing
// NOTE(review): this listing omits short lines (braces, else, break, return and some conditions).
4731 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4735 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 per vertex up front: screen coordinates and transformed positions
4736 int datasize = 2*numvertices*sizeof(float[4]);
4737 DPSOFTRAST_Command_Draw *command;
4738 unsigned char *data;
// plus one float4 per vertex for every array the shader mode lists
4739 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4741 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// an out-of-range entry marks the end of the list
4742 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4744 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, sized for 16-bit or 32-bit indices
4747 datasize += numtriangles*sizeof(unsigned short[3]);
4749 datasize += numtriangles*sizeof(int[3]);
4750 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inside the command: allocate only the header as a command and put the
// data in a separate zeroed allocation
// NOTE(review): no check of the MM_CALLOC result is visible in this listing
4751 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4753 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4754 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data is stored directly after the command header
4758 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4759 data = (unsigned char *)command + commandsize;
4761 command->firstvertex = firstvertex;
4762 command->numvertices = numvertices;
4763 command->numtriangles = numtriangles;
4764 command->arrays = (float *)data;
// clear all array pointers, then carve the data block up in order:
// screen coordinates, positions, then each listed array
4765 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4766 dpsoftrast.firstvertex = firstvertex;
4767 dpsoftrast.numvertices = numvertices;
4768 dpsoftrast.screencoord4f = (float *)data;
4769 data += numvertices*sizeof(float[4]);
4770 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4771 data += numvertices*sizeof(float[4]);
4772 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4774 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4775 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4777 dpsoftrast.post_array4f[j] = (float *)data;
4778 data += numvertices*sizeof(float[4]);
// the index copy is placed after the vertex arrays; both pointers start out NULL
4780 command->element3i = NULL;
4781 command->element3s = NULL;
4784 command->element3s = (unsigned short *)data;
4785 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4789 command->element3i = (int *)data;
4790 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues one draw call: allocates the Draw command, runs the active shader
// mode's Vertex() stage, then hands the command to the rasterizer threads.
// All parameters are passed straight to DPSOFTRAST_Draw_AllocateDrawCommand.
4795 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4797 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4798 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// limit the row range recorded for this draw to the framebuffer height
4799 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4800 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release the command again.  The arrays are freed separately
// only when the command is header-sized, which is how
// DPSOFTRAST_Draw_AllocateDrawCommand queues a command whose data lives in its
// own MM_CALLOC block.
4801 if (command->starty >= command->endy)
4803 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4804 MM_FREE(command->arrays);
4805 DPSOFTRAST_UndoCommand(command->commandsize);
4808 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; the draw interpreter decrements it and releases the arrays at zero
4809 command->refcount = dpsoftrast.numthreads;
4811 if (dpsoftrast.usethreads)
4814 DPSOFTRAST_Draw_SyncCommands();
// wake sleeping (starving) threads whose row bands overlap [starty, endy)
// NOTE(review): starving is read here without holding thread->drawmutex
4815 for (i = 0; i < dpsoftrast.numthreads; i++)
4817 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4818 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4819 Thread_CondSignal(thread->drawcond);
// presumably the unthreaded path: process the queued commands right away
4824 DPSOFTRAST_Draw_FlushThreads();
// Executes the queued commands for one thread, starting at thread->commandoffset
// and stopping when the read position equals endoffset.
//   thread    - per-thread state passed to every command handler
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
// On return thread->commandoffset holds the new read position.
4828 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4830 int commandoffset = thread->commandoffset;
4831 while (commandoffset != endoffset)
4833 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4834 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, step over the aligned command struct, and wrap
// the read position to 0 once it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
4836 #define INTERPCOMMAND(name) \
4837 case DPSOFTRAST_OPCODE_##name : \
4838 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4839 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4840 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4841 commandoffset = 0; \
4843 INTERPCOMMAND(Viewport)
4844 INTERPCOMMAND(ClearColor)
4845 INTERPCOMMAND(ClearDepth)
4846 INTERPCOMMAND(ColorMask)
4847 INTERPCOMMAND(DepthTest)
4848 INTERPCOMMAND(ScissorTest)
4849 INTERPCOMMAND(Scissor)
4850 INTERPCOMMAND(BlendFunc)
4851 INTERPCOMMAND(BlendSubtract)
4852 INTERPCOMMAND(DepthMask)
4853 INTERPCOMMAND(DepthFunc)
4854 INTERPCOMMAND(DepthRange)
4855 INTERPCOMMAND(PolygonOffset)
4856 INTERPCOMMAND(CullFace)
4857 INTERPCOMMAND(AlphaTest)
4858 INTERPCOMMAND(AlphaFunc)
4859 INTERPCOMMAND(SetTexture)
4860 INTERPCOMMAND(SetShader)
4861 INTERPCOMMAND(Uniform4f)
4862 INTERPCOMMAND(UniformMatrix4f)
4863 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the step comes from the command itself.
// The read position is also stored right after each draw instead of only at
// the end of the loop.
// NOTE(review): presumably so the producer sees progress between draws - confirm
4865 case DPSOFTRAST_OPCODE_Draw:
4866 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4867 commandoffset += command->commandsize;
4868 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4870 thread->commandoffset = commandoffset;
// NOTE(review): presumably a wrap marker that sends the read position back to the pool start - confirm
4873 case DPSOFTRAST_OPCODE_Reset:
// store the final read position for this thread
4878 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
//   data - the DPSOFTRAST_State_Thread owned by this worker
// Keeps running for as long as thread->index is non-negative.
4881 static int DPSOFTRAST_Draw_Thread(void *data)
4883 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4884 while(thread->index >= 0)
// commands are pending: interpret everything queued up to the current drawcommand
4886 if (thread->commandoffset != dpsoftrast.drawcommand)
4888 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// idle path: take the mutex and test again before sleeping, so that work or a
// stop request that arrived in the meantime is not slept through
4892 Thread_LockMutex(thread->drawmutex);
4893 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher blocked on waitcond that this thread has caught up
4895 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving marks this thread as asleep on drawcond for the code that signals it
4896 thread->starving = true;
4897 Thread_CondWait(thread->drawcond, thread->drawmutex);
4898 thread->starving = false;
4900 Thread_UnlockMutex(thread->drawmutex);
4906 static void DPSOFTRAST_Draw_FlushThreads(void)
4908 DPSOFTRAST_State_Thread *thread;
4910 DPSOFTRAST_Draw_SyncCommands();
4911 if (dpsoftrast.usethreads)
4913 for (i = 0; i < dpsoftrast.numthreads; i++)
4915 thread = &dpsoftrast.threads[i];
4916 if (thread->commandoffset != dpsoftrast.drawcommand)
4918 Thread_LockMutex(thread->drawmutex);
4919 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4920 Thread_CondSignal(thread->drawcond);
4921 Thread_UnlockMutex(thread->drawmutex);
4924 for (i = 0; i < dpsoftrast.numthreads; i++)
4926 thread = &dpsoftrast.threads[i];
4927 if (thread->commandoffset != dpsoftrast.drawcommand)
4929 Thread_LockMutex(thread->drawmutex);
4930 if (thread->commandoffset != dpsoftrast.drawcommand)
4932 thread->waiting = true;
4933 Thread_CondWait(thread->waitcond, thread->drawmutex);
4934 thread->waiting = false;
4936 Thread_UnlockMutex(thread->drawmutex);
4942 for (i = 0; i < dpsoftrast.numthreads; i++)
4944 thread = &dpsoftrast.threads[i];
4945 if (thread->commandoffset != dpsoftrast.drawcommand)
4946 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4949 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
4952 void DPSOFTRAST_Flush(void)
4954 DPSOFTRAST_Draw_FlushThreads();
// Public entry point with no parameters and no return value.
// NOTE(review): presumably completes all outstanding work, much like DPSOFTRAST_Flush - confirm against the function body.
4957 void DPSOFTRAST_Finish(void)
4962 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4972 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4973 dpsoftrast.bigendian = u.b[3];
4974 dpsoftrast.fb_width = width;
4975 dpsoftrast.fb_height = height;
4976 dpsoftrast.fb_depthpixels = depthpixels;
4977 dpsoftrast.fb_colorpixels[0] = colorpixels;
4978 dpsoftrast.fb_colorpixels[1] = NULL;
4979 dpsoftrast.fb_colorpixels[1] = NULL;
4980 dpsoftrast.fb_colorpixels[1] = NULL;
4981 dpsoftrast.viewport[0] = 0;
4982 dpsoftrast.viewport[1] = 0;
4983 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4984 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4985 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4986 dpsoftrast.texture_firstfree = 1;
4987 dpsoftrast.texture_end = 1;
4988 dpsoftrast.texture_max = 0;
4989 dpsoftrast.color[0] = 1;
4990 dpsoftrast.color[1] = 1;
4991 dpsoftrast.color[2] = 1;
4992 dpsoftrast.color[3] = 1;
4993 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4994 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4995 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
4996 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4997 for (i = 0; i < dpsoftrast.numthreads; i++)
4999 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5001 thread->cullface = GL_BACK;
5002 thread->colormask[1] = 1;
5003 thread->colormask[2] = 1;
5004 thread->colormask[3] = 1;
5005 thread->blendfunc[0] = GL_ONE;
5006 thread->blendfunc[1] = GL_ZERO;
5007 thread->depthmask = true;
5008 thread->depthtest = true;
5009 thread->depthfunc = GL_LEQUAL;
5010 thread->scissortest = false;
5011 thread->alphatest = false;
5012 thread->alphafunc = GL_GREATER;
5013 thread->alphavalue = 0.5f;
5014 thread->viewport[0] = 0;
5015 thread->viewport[1] = 0;
5016 thread->viewport[2] = dpsoftrast.fb_width;
5017 thread->viewport[3] = dpsoftrast.fb_height;
5018 thread->scissor[0] = 0;
5019 thread->scissor[1] = 0;
5020 thread->scissor[2] = dpsoftrast.fb_width;
5021 thread->scissor[3] = dpsoftrast.fb_height;
5022 thread->depthrange[0] = 0;
5023 thread->depthrange[1] = 1;
5024 thread->polygonoffset[0] = 0;
5025 thread->polygonoffset[1] = 0;
5027 if (dpsoftrast.interlace)
5029 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5030 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5031 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5032 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5036 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5037 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5040 thread->numspans = 0;
5041 thread->numtriangles = 0;
5042 thread->commandoffset = 0;
5043 thread->waiting = false;
5044 thread->starving = false;
5046 thread->validate = -1;
5047 DPSOFTRAST_Validate(thread, -1);
5049 if (dpsoftrast.usethreads)
5051 thread->waitcond = Thread_CreateCond();
5052 thread->drawcond = Thread_CreateCond();
5053 thread->drawmutex = Thread_CreateMutex();
5054 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5060 void DPSOFTRAST_Shutdown(void)
5063 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5065 DPSOFTRAST_State_Thread *thread;
5066 for (i = 0; i < dpsoftrast.numthreads; i++)
5068 thread = &dpsoftrast.threads[i];
5069 Thread_LockMutex(thread->drawmutex);
5071 Thread_CondSignal(thread->drawcond);
5072 Thread_UnlockMutex(thread->drawmutex);
5073 Thread_WaitThread(thread->thread, 0);
5074 Thread_DestroyCond(thread->waitcond);
5075 Thread_DestroyCond(thread->drawcond);
5076 Thread_DestroyMutex(thread->drawmutex);
5079 for (i = 0;i < dpsoftrast.texture_end;i++)
5080 if (dpsoftrast.texture[i].bytes)
5081 MM_FREE(dpsoftrast.texture[i].bytes);
5082 if (dpsoftrast.texture)
5083 free(dpsoftrast.texture);
5084 if (dpsoftrast.threads)
5085 MM_FREE(dpsoftrast.threads);
5086 memset(&dpsoftrast, 0, sizeof(dpsoftrast));