3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
// alignment, in bytes, that MM_MALLOC/MM_CALLOC request from _mm_malloc
14 #define ATOMIC_SIZE 32
// GCC-attribute variants: ALIGN requests 16-byte alignment, ATOMIC requests 32-byte alignment
18 #define ALIGN(var) var __attribute__((__aligned__(16)))
19 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// NOTE(review): _mm_sfence is a store fence; the alternative kept in the comment below is __sync_synchronize() - confirm a store-only fence is sufficient
20 #define MEMORY_BARRIER (_mm_sfence())
21 //(__sync_synchronize())
// counters are updated through the __sync builtins; ATOMIC_INCREMENT/ATOMIC_DECREMENT evaluate to the updated value, ATOMIC_ADD evaluates to void
22 #define ATOMIC_COUNTER volatile int
23 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
24 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
25 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC variants: __declspec alignment and the Interlocked* functions
26 #elif defined(_MSC_VER)
27 #define ALIGN(var) __declspec(align(16)) var
28 #define ATOMIC(var) __declspec(align(32)) var
29 #define MEMORY_BARRIER (_mm_sfence())
31 #define ATOMIC_COUNTER volatile LONG
32 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
33 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
34 #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
// fallback definitions: no alignment, no barrier, and plain (non-atomic) integer arithmetic
39 #define ALIGN(var) var
42 #define ATOMIC(var) var
44 #ifndef MEMORY_BARRIER
45 #define MEMORY_BARRIER ((void)0)
47 #ifndef ATOMIC_COUNTER
48 #define ATOMIC_COUNTER int
50 #ifndef ATOMIC_INCREMENT
51 #define ATOMIC_INCREMENT(counter) (++(counter))
53 #ifndef ATOMIC_DECREMENT
54 #define ATOMIC_DECREMENT(counter) (--(counter))
57 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
61 #include <emmintrin.h>
// aligned allocation helpers built on _mm_malloc/_mm_free, using ATOMIC_SIZE as the alignment
63 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// zero-filled counterpart of MM_MALLOC: allocates nmemb*size bytes and clears them when the allocation succeeded
// NOTE(review): nmemb*size is not checked for overflow before being passed to _mm_malloc
65 static void *MM_CALLOC(size_t nmemb, size_t size)
67 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
68 if (ptr != NULL) memset(ptr, 0, nmemb*size);
72 #define MM_FREE _mm_free
// alternative definitions that map straight onto the C library allocator
74 #define MM_MALLOC(size) malloc(size)
75 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// indices of the vertex attribute arrays: position, color and eight texcoord sets;
// DPSOFTRAST_ARRAY_TOTAL is the count and sizes the attribs[] and post_array4f[] arrays
79 typedef enum DPSOFTRAST_ARRAY_e
81 DPSOFTRAST_ARRAY_POSITION,
82 DPSOFTRAST_ARRAY_COLOR,
83 DPSOFTRAST_ARRAY_TEXCOORD0,
84 DPSOFTRAST_ARRAY_TEXCOORD1,
85 DPSOFTRAST_ARRAY_TEXCOORD2,
86 DPSOFTRAST_ARRAY_TEXCOORD3,
87 DPSOFTRAST_ARRAY_TEXCOORD4,
88 DPSOFTRAST_ARRAY_TEXCOORD5,
89 DPSOFTRAST_ARRAY_TEXCOORD6,
90 DPSOFTRAST_ARRAY_TEXCOORD7,
91 DPSOFTRAST_ARRAY_TOTAL
// one slot of the dpsoftrast.texture array
95 typedef struct DPSOFTRAST_Texture_s
// filter mode, stored by DPSOFTRAST_Texture_Filter
102 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts how many times the texture is currently bound - confirm against the bind code
105 ATOMIC_COUNTER binds;
// pixel storage for all mip levels (allocated in DPSOFTRAST_Texture_New); NULL marks an unused slot
106 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
107 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// command sizes are rounded up to a multiple of COMMAND_SIZE by DPSOFTRAST_ALIGNCOMMAND
111 #define COMMAND_SIZE ALIGN_SIZE
112 #define COMMAND_ALIGN(var) ALIGN(var)
// common header shared by every command: the opcode followed by the size of the whole command
114 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
116 unsigned char opcode;
117 unsigned short commandsize;
// opcode 0 is written by DPSOFTRAST_AllocateCommand when a command does not fit in the space left before the end of the pool
121 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the type
// DPSOFTRAST_Command_<name>, a struct that starts with the same opcode/commandsize header as DPSOFTRAST_Command
// NOTE(review): `fields` is presumably pasted in after the header - confirm in the full macro body
123 #define DEFCOMMAND(opcodeval, name, fields) \
124 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
125 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
127 unsigned char opcode; \
128 unsigned short commandsize; \
130 } DPSOFTRAST_Command_##name );
// size in bytes of the command buffer below
132 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command - confirm at the use sites
133 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// command pool; DPSOFTRAST_AllocateCommand places commands into the commands[] byte buffer
135 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
139 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
141 DPSOFTRAST_State_Command_Pool);
// per-triangle data consumed when spans are shaded
143 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
145 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// three float4 rows per attribute array; DPSOFTRAST_CALCATTRIB evaluates row[2] + x*row[0] + y*row[1]
147 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
149 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at a span's start: loads attribs[arrayindex][0] into slope and sets
// data = attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1];
// _mm_load_ps is the aligned load, which matches the ALIGN() on the attribs array
151 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
152 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
153 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
154 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
155 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: copies attribs[arrayindex][0] into slope[0..3] and sets
// data[c] = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c] for c = 0..3.
// triangle and span are pointers, data and slope are arrays of (at least) four floats.
// Every argument is parenthesized so pointer expressions such as `spans + 1` expand correctly;
// the arguments are still evaluated several times, so they must be free of side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): presumably the number of pixels handled per sub-span step - confirm at the use sites
168 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// one span record; the x/y fields feed the attribute interpolation in DPSOFTRAST_CALCATTRIB
170 typedef ALIGN(struct DPSOFTRAST_State_Span_s
172 int triangle; // triangle this span was generated by
173 int x; // framebuffer x coord
174 int y; // framebuffer y coord
175 int startx; // usable range (according to pixelmask)
176 int endx; // usable range (according to pixelmask)
177 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
179 DPSOFTRAST_State_Span);
// capacities of the per-thread spans[] and triangles[] arrays
181 #define DPSOFTRAST_DRAW_MAXSPANS 1024
182 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// bit flags stored in thread->validate; a set bit means the matching derived values must be recomputed by DPSOFTRAST_Validate
184 #define DPSOFTRAST_VALIDATE_FB 1
185 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
186 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three flags combined
187 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// blend modes that DPSOFTRAST_RecalcBlendFunc selects from the (sfactor, dfactor, blendsubtract) state;
// combinations it does not recognise fall back to DPSOFTRAST_BLENDMODE_OPAQUE
189 typedef enum DPSOFTRAST_BLENDMODE_e
191 DPSOFTRAST_BLENDMODE_OPAQUE,
192 DPSOFTRAST_BLENDMODE_ALPHA,
193 DPSOFTRAST_BLENDMODE_ADDALPHA,
194 DPSOFTRAST_BLENDMODE_ADD,
195 DPSOFTRAST_BLENDMODE_INVMOD,
196 DPSOFTRAST_BLENDMODE_MUL,
197 DPSOFTRAST_BLENDMODE_MUL2,
198 DPSOFTRAST_BLENDMODE_SUBALPHA,
199 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
200 DPSOFTRAST_BLENDMODE_INVADD,
201 DPSOFTRAST_BLENDMODE_TOTAL
203 DPSOFTRAST_BLENDMODE;
// per-thread copy of the render state; the DPSOFTRAST_Interpret_* functions write the command values into it
205 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// written by DPSOFTRAST_Interpret_PolygonOffset: [0] alongnormal, [1] intoview
224 float polygonoffset[2];
227 int shader_permutation;
// pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow when that array is reallocated
229 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
231 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
232 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
234 // DPSOFTRAST_VALIDATE_ flags
237 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
240 ALIGN(float fb_viewportcenter[4]);
241 ALIGN(float fb_viewportscale[4]);
243 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
246 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it with freecommand and drawcommand
255 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool
257 volatile bool waiting;
258 volatile bool starving;
265 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
266 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 DPSOFTRAST_State_Thread);
// global rasterizer state; the single instance is the `dpsoftrast` variable declared after the struct
270 typedef ATOMIC(struct DPSOFTRAST_State_s
// render targets installed by DPSOFTRAST_SetRenderTargets: one depth buffer and up to four color buffers
274 unsigned int *fb_depthpixels;
275 unsigned int *fb_colorpixels[4];
// recomputed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
278 ALIGN(float fb_viewportcenter[4]);
279 ALIGN(float fb_viewportscale[4]);
282 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
283 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// NOTE(review): presumably the caller-supplied vertex arrays for the next draw - confirm against the pointer-setting functions
285 const float *pointer_vertex3f;
286 const float *pointer_color4f;
287 const unsigned char *pointer_color4ub;
288 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
291 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
292 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into the texture array below; rebased by DPSOFTRAST_Texture_Grow
293 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
297 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
298 float *screencoord4f;
304 int shader_permutation;
// texture slot bookkeeping: texture_firstfree is where DPSOFTRAST_Texture_New starts searching for an empty slot
308 int texture_firstfree;
309 DPSOFTRAST_Texture *texture;
// set to a static message by functions that reject their arguments
314 const char *errorstring;
319 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
321 ATOMIC(volatile int drawcommand);
323 DPSOFTRAST_State_Command_Pool commandpool;
327 DPSOFTRAST_State dpsoftrast;
// scale applied to depth values before conversion to integer (1024 * 1048576 = 2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one 32-bit pixel: a in bits 24-31, r in 16-23, g in 8-15, b in 0-7.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation; no clamping is performed.
// Arguments are parenthesized so expressions such as `x + y` convert correctly, and the shifts are done
// on unsigned values so that a == 1.0 (255 << 24) does not overflow a signed int; the result is still an int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	((unsigned int)(int)((b) * 255.0f + 0.5f)) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to integer depth: DPSOFTRAST_DEPTHSCALE * (1 - d), truncated to int
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derives the viewport transform from viewport = {x, y, width, height}.
// Element 1 is the x axis, element 2 the y axis (flipped against dpsoftrast.fb_height, hence the negative scale),
// element 3 is fixed at 0.5 for both center and scale, and element 0 is 0 for the center and 1 for the scale.
335 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
337 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
338 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
339 fb_viewportcenter[3] = 0.5f;
340 fb_viewportcenter[0] = 0.0f;
341 fb_viewportscale[1] = 0.5f * viewport[2];
342 fb_viewportscale[2] = -0.5f * viewport[3];
343 fb_viewportscale[3] = 0.5f;
344 fb_viewportscale[0] = 1.0f;
// Recomputes the thread's framebuffer-derived state: fb_scissor = {x, y, width, height} and the viewport transform
347 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
349 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
350 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y flipped against the framebuffer height
353 x1 = thread->scissor[0];
354 x2 = thread->scissor[0] + thread->scissor[2];
355 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
356 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
357 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
359 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
361 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
362 thread->fb_scissor[0] = x1;
363 thread->fb_scissor[1] = y1;
364 thread->fb_scissor[2] = x2 - x1;
365 thread->fb_scissor[3] = y2 - y1;
367 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
// Derives the effective depth function: the configured depthfunc while depth testing is enabled, GL_ALWAYS otherwise
370 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
372 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's (blendfunc[0], blendfunc[1]) pair to one of the DPSOFTRAST_BLENDMODE values and stores it in fb_blendmode.
// The two factors are combined into a single switch key as (sfactor<<16)|dfactor; unrecognised pairs select OPAQUE.
375 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) has a dedicated mode
377 if (thread->blendsubtract)
379 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
381 #define BLENDFUNC(sfactor, dfactor, blendmode) \
382 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
383 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
384 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// second table; NOTE(review): presumably the branch taken when blendsubtract is off - confirm
389 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
392 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
393 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
394 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
395 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both (GL_ZERO, GL_SRC_COLOR) and (GL_DST_COLOR, GL_ZERO) select MUL
396 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
397 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
398 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
399 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
400 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
401 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one flag of f is still set in thread->validate;
// the whole expression always evaluates to 0.
// thread and f are parenthesized so arbitrary expressions can be passed, but each is expanded twice,
// so they must be free of side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state selected by mask that is currently flagged stale in thread->validate.
// Each handled flag is cleared before its Recalc function runs.
408 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only flags that are both requested and pending are processed
410 mask &= thread->validate;
413 if (mask & DPSOFTRAST_VALIDATE_FB)
415 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
416 DPSOFTRAST_RecalcFB(thread);
418 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
420 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
421 DPSOFTRAST_RecalcDepthFunc(thread);
423 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
425 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
426 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when index is in [1, texture_end) and the slot has
// pixel storage (bytes != NULL); callers in this file treat a null result as "no such texture".
430 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
432 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
433 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every texbound[] pointer (global and per thread)
// onto the reallocated array, using each pointer's offset from the old base.
437 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base so bound-texture pointers can be translated after realloc
439 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
440 DPSOFTRAST_State_Thread *thread;
444 // expand texture array as needed
445 if (dpsoftrast.texture_max < 1024)
446 dpsoftrast.texture_max = 1024;
448 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL check; on failure the old array
// would be leaked and the rebasing below would compute pointers from a null base
449 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
450 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
451 if (dpsoftrast.texbound[i])
452 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
453 for (j = 0; j < dpsoftrast.numthreads; j++)
455 thread = &dpsoftrast.threads[j];
456 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457 if (thread->texbound[i])
458 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions: validates the arguments (recording a message in dpsoftrast.errorstring
// when one is rejected), claims the first empty slot of dpsoftrast.texture, fills in the mipmap table and
// allocates zeroed pixel storage.
462 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
471 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
472 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
473 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is evaluated in int, so large or pairwise-negative dimensions are not reliably caught here
474 if (width*height*depth < 1)
476 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
481 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
486 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
487 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
488 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
490 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
491 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
493 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
503 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures (depth != 1) and cubemaps
508 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
510 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
515 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition repeats the previous check verbatim
518 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
528 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533 // find first empty slot in texture array
534 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
535 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just taken; grow the array and extend the used range when needed
537 dpsoftrast.texture_firstfree = texnum + 1;
538 if (dpsoftrast.texture_max <= texnum)
539 DPSOFTRAST_Texture_Grow();
540 if (dpsoftrast.texture_end <= texnum)
541 dpsoftrast.texture_end = texnum + 1;
542 texture = &dpsoftrast.texture[texnum];
543 memset(texture, 0, sizeof(*texture));
544 texture->flags = flags;
545 texture->width = width;
546 texture->height = height;
547 texture->depth = depth;
548 texture->sides = sides;
// per level: s is the level's byte size at 4 bytes per pixel, size is the running byte offset
560 s = w * h * d * sides * 4;
561 texture->mipmap[mipmaps][0] = size;
562 texture->mipmap[mipmaps][1] = s;
563 texture->mipmap[mipmaps][2] = w;
564 texture->mipmap[mipmaps][3] = h;
565 texture->mipmap[mipmaps][4] = d;
// NOTE(review): presumably the loop exit - the chain ends at a 1x1x1 level or after level 0 when mipmaps are not requested
568 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574 texture->mipmaps = mipmaps;
575 texture->size = size;
577 // allocate the pixels now
// NOTE(review): no visible check of the allocation result; a NULL bytes pointer also marks the slot as unused
578 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index`, clears its slot and updates the free/used slot bookkeeping.
// Does nothing when index does not name a live texture.
582 void DPSOFTRAST_Texture_Free(int index)
584 DPSOFTRAST_Texture *texture;
585 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
589 MM_FREE(texture->bytes);
// the memset below also clears bytes, which is what marks the slot as empty
590 texture->bytes = NULL;
591 memset(texture, 0, sizeof(*texture));
592 // adjust the free range and used range
593 if (dpsoftrast.texture_firstfree > index)
594 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing empty slots
595 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
596 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index` by averaging texels of the next larger level.
// Pixels are handled as 4 bytes each; every channel is averaged independently with rounding.
598 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
600 int i, x, y, z, w, layer0, layer1, row0, row1;
601 unsigned char *o, *i0, *i1, *i2, *i3;
602 DPSOFTRAST_Texture *texture;
603 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
604 if (texture->mipmaps <= 1)
606 for (i = 1;i < texture->mipmaps;i++)
608 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
612 if (layer1 >= texture->mipmap[i-1][4])
613 layer1 = texture->mipmap[i-1][4]-1;
614 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
618 if (row1 >= texture->mipmap[i-1][3])
619 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the parent rows (layer0/layer1 x row0/row1) in level i-1
620 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
621 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
622 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
623 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
624 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
625 w = texture->mipmap[i][2];
628 if (texture->mipmap[i-1][2] > 1)
630 // average 3D texture
// eight source texels per output texel, hence +4 and >>3
631 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
633 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
634 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
635 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
636 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
641 // average 3D mipmap with parent width == 1
// NOTE(review): only i0 and i1 are advanced here while i2 and i3 are read too; harmless if w is 1 in this case - confirm
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
644 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
645 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
646 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
647 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
653 if (texture->mipmap[i-1][2] > 1)
655 // average 2D texture (common case)
// four source texels per output texel, hence +2 and >>2
656 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
658 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
659 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
660 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
661 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
666 // 2D texture with parent width == 1
667 o[0] = (i0[0] + i1[0] + 1) >> 1;
668 o[1] = (i0[1] + i1[1] + 1) >> 1;
669 o[2] = (i0[2] + i1[2] + 1) >> 1;
670 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels` into texture `index` at
// (blockx, blocky), then regenerates the mipmaps. Source rows are tightly packed (blockwidth * 4 bytes each).
// NOTE(review): the visible lines always address level 0 (mipmap[0]) and never read `mip`; the block
// rectangle is not range-checked against the texture size in the visible lines either
677 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
679 DPSOFTRAST_Texture *texture;
681 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel; mipmap[0][2] is the level-0 width, so the destination stride is width * 4 bytes
684 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
685 while (blockheight > 0)
687 memcpy(dst, pixels, blockwidth * 4);
688 pixels += blockwidth * 4;
689 dst += texture->mipmap[0][2] * 4;
692 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole level-0 image of texture `index` with `pixels` (mipmap[0][1] bytes are copied)
// and regenerates the mipmaps. Does nothing when index does not name a live texture.
694 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
696 DPSOFTRAST_Texture *texture;
697 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
701 DPSOFTRAST_Texture_CalculateMipmaps(index);
703 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
705 DPSOFTRAST_Texture *texture;
706 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
707 return texture->mipmap[mip][2];
709 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
711 DPSOFTRAST_Texture *texture;
712 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713 return texture->mipmap[mip][3];
715 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
717 DPSOFTRAST_Texture *texture;
718 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's pixel storage
// (bytes + mipmap[mip][0]), or 0 when index does not name a live texture.
// NOTE(review): mip is not range-checked in the visible lines
721 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
723 DPSOFTRAST_Texture *texture;
724 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
727 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected
// with an error string when the texture was created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
729 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
733 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
735 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
740 texture->filter = filter;
// Installs the framebuffer size, the depth buffer and up to four color buffers.
// The stores are made only when at least one of the values differs from the current state.
743 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
745 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
746 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
747 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
749 dpsoftrast.fb_width = width;
750 dpsoftrast.fb_height = height;
751 dpsoftrast.fb_depthpixels = depthpixels;
752 dpsoftrast.fb_colorpixels[0] = colorpixels0;
753 dpsoftrast.fb_colorpixels[1] = colorpixels1;
754 dpsoftrast.fb_colorpixels[2] = colorpixels2;
755 dpsoftrast.fb_colorpixels[3] = colorpixels3;
758 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the commands written so far: drawcommand is advanced to the pool's current write offset.
// With threads in use a MEMORY_BARRIER is issued first, so the command bytes are ordered before the new offset.
760 static void DPSOFTRAST_Draw_SyncCommands(void)
762 if(dpsoftrast.usethreads) MEMORY_BARRIER;
763 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes of the command pool available by recomputing how much of the pool the
// threads still have to process and, when that is not enough, waiting on the thread that is furthest behind.
766 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
768 DPSOFTRAST_State_Thread *thread;
770 int freecommand = dpsoftrast.commandpool.freecommand;
771 int usedcommands = dpsoftrast.commandpool.usedcommands;
772 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
774 DPSOFTRAST_Draw_SyncCommands();
// for each thread: distance from its read offset to the write offset, wrapped around the pool size
780 for (i = 0; i < dpsoftrast.numthreads; i++)
782 thread = &dpsoftrast.threads[i];
783 commandoffset = freecommand - thread->commandoffset;
784 if (commandoffset < 0)
785 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest backlog
786 if (commandoffset > usedcommands)
789 usedcommands = commandoffset;
792 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for the selected thread: flag ourselves as waiting, wake it if it is starving, then block on waitcond
794 thread = &dpsoftrast.threads[waitindex];
795 Thread_LockMutex(thread->drawmutex);
796 if (thread->commandoffset != dpsoftrast.drawcommand)
798 thread->waiting = true;
799 if (thread->starving) Thread_CondSignal(thread->drawcond);
800 Thread_CondWait(thread->waitcond, thread->drawmutex);
801 thread->waiting = false;
803 Thread_UnlockMutex(thread->drawmutex);
805 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of COMMAND_SIZE (the mask arithmetic assumes a power of two).
// DPSOFTRAST_ALLOCATECOMMAND(name) allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and returns it with the matching pointer type.
808 #define DPSOFTRAST_ALIGNCOMMAND(size) \
809 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
810 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
811 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given opcode and fills in its header.
// When the command does not fit before the end of the buffer a Reset opcode is written at the current position
// and the unused tail is counted as used.
813 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
815 DPSOFTRAST_Command *command;
816 int freecommand = dpsoftrast.commandpool.freecommand;
817 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one more command header; add the buffer tail when this command would not fit in it
818 int extra = sizeof(DPSOFTRAST_Command);
819 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
820 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim processed commands (threaded) or flush, then reload the pool offsets
821 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
823 if (dpsoftrast.usethreads)
824 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
826 DPSOFTRAST_Draw_FlushThreads();
827 freecommand = dpsoftrast.commandpool.freecommand;
828 usedcommands = dpsoftrast.commandpool.usedcommands;
// NOTE(review): presumably freecommand is reset to the start of the buffer after the Reset marker - confirm
830 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
832 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833 command->opcode = DPSOFTRAST_OPCODE_Reset;
834 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
837 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838 command->opcode = opcode;
839 command->commandsize = size;
841 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
843 dpsoftrast.commandpool.freecommand = freecommand;
844 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recent allocation of `size` bytes: usedcommands is reduced by size and the updated
// offsets are stored back into the pool.
// NOTE(review): the adjustment of freecommand is only partly visible; the += below is presumably the wrap-around case
848 static void DPSOFTRAST_UndoCommand(int size)
850 int freecommand = dpsoftrast.commandpool.freecommand;
851 int usedcommands = dpsoftrast.commandpool.usedcommands;
854 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
855 usedcommands -= size;
856 dpsoftrast.commandpool.freecommand = freecommand;
857 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): x, y, width, height
860 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// copies the command into the thread's viewport and flags the framebuffer-derived state for recomputation
861 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 thread->viewport[0] = command->x;
864 thread->viewport[1] = command->y;
865 thread->viewport[2] = command->width;
866 thread->viewport[3] = command->height;
867 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// queues a Viewport command and also updates the global viewport and its derived transform immediately
869 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
874 command->width = width;
875 command->height = height;
877 dpsoftrast.viewport[0] = x;
878 dpsoftrast.viewport[1] = y;
879 dpsoftrast.viewport[2] = width;
880 dpsoftrast.viewport[3] = height;
881 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): the clear color as four floats
884 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// fills the scissor rectangle of every attached color buffer with the packed clear color,
// restricted to the rows delimited by the thread's miny1/maxy1 and miny2/maxy2 values
885 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 int i, x1, y1, x2, y2, w, h, x, y;
888 int miny1 = thread->miny1;
889 int maxy1 = thread->maxy1;
890 int miny2 = thread->miny2;
891 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
895 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
896 x1 = thread->fb_scissor[0];
897 y1 = thread->fb_scissor[1];
898 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
899 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
900 if (y1 < miny1) y1 = miny1;
901 if (y2 > maxy2) y2 = maxy2;
906 // FIXME: honor fb_colormask?
907 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; unset ones are tested for below
908 for (i = 0;i < 4;i++)
910 if (!dpsoftrast.fb_colorpixels[i])
// NOTE(review): the rows appear to be walked as two bands, first up to maxy1 and then from miny2 up to maxy2 - confirm
912 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
915 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
916 for (x = x1;x < x2;x++)
// queues a ClearColor command
921 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
923 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): the clear depth as a float
930 DEFCOMMAND(3, ClearDepth, float depth;)
// fills the scissor rectangle of the depth buffer with the converted clear depth,
// restricted to the rows delimited by the thread's miny1/maxy1 and miny2/maxy2 values
931 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
933 int x1, y1, x2, y2, w, h, x, y;
934 int miny1 = thread->miny1;
935 int maxy1 = thread->maxy1;
936 int miny2 = thread->miny2;
937 int maxy2 = thread->maxy2;
// make sure fb_scissor is up to date before reading it
941 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
942 x1 = thread->fb_scissor[0];
943 y1 = thread->fb_scissor[1];
944 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
945 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
946 if (y1 < miny1) y1 = miny1;
947 if (y2 > maxy2) y2 = maxy2;
952 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// NOTE(review): same two-band row walk as in DPSOFTRAST_Interpret_ClearColor - confirm
953 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
956 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
957 for (x = x1;x < x2;x++)
// queues a ClearDepth command
961 void DPSOFTRAST_ClearDepth(float d)
963 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one enable value per channel
967 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// stores the four enables as 0/1 and builds the matching pixel bit mask
968 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
970 thread->colormask[0] = command->r != 0;
971 thread->colormask[1] = command->g != 0;
972 thread->colormask[2] = command->b != 0;
973 thread->colormask[3] = command->a != 0;
// negating a 0/1 enable gives all-zero or all-one bits, which selects that channel's byte: r 16-23, g 8-15, b 0-7, a 24-31
974 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// queues a ColorMask command
976 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
978 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag
985 DEFCOMMAND(5, DepthTest, int enable;)
// stores the flag and marks the derived depth function (fb_depthfunc) as stale
986 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
988 thread->depthtest = command->enable;
989 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// queues a DepthTest command
991 void DPSOFTRAST_DepthTest(int enable)
993 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
994 command->enable = enable;
// ScissorTest command (opcode 6): enable flag
997 DEFCOMMAND(6, ScissorTest, int enable;)
// stores the flag and marks the framebuffer-derived state (fb_scissor) as stale
998 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1000 thread->scissortest = command->enable;
1001 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// queues a ScissorTest command
1003 void DPSOFTRAST_ScissorTest(int enable)
1005 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1006 command->enable = enable;
// Scissor command (opcode 7): rectangle as x, y, width, height (floats)
1009 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// copies the rectangle into the thread's scissor and marks the framebuffer-derived state as stale
1010 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1012 thread->scissor[0] = command->x;
1013 thread->scissor[1] = command->y;
1014 thread->scissor[2] = command->width;
1015 thread->scissor[3] = command->height;
1016 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// queues a Scissor command
1018 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1020 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1023 command->width = width;
1024 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors
1027 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// stores both factors and marks the derived blend mode (fb_blendmode) as stale
1028 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1030 thread->blendfunc[0] = command->sfactor;
1031 thread->blendfunc[1] = command->dfactor;
1032 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// queues a BlendFunc command
1034 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1036 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1037 command->sfactor = sfactor;
1038 command->dfactor = dfactor;
// BlendSubtract command (opcode 9): enable flag
1041 DEFCOMMAND(9, BlendSubtract, int enable;)
// stores the flag and marks the derived blend mode (fb_blendmode) as stale
1042 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1044 thread->blendsubtract = command->enable;
1045 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// queues a BlendSubtract command
1047 void DPSOFTRAST_BlendSubtract(int enable)
1049 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1050 command->enable = enable;
// DepthMask command (opcode 10): enable flag
1053 DEFCOMMAND(10, DepthMask, int enable;)
// stores the flag in the thread state
1054 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1056 thread->depthmask = command->enable;
// queues a DepthMask command
1058 void DPSOFTRAST_DepthMask(int enable)
1060 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1061 command->enable = enable;
1064 DEFCOMMAND(11, DepthFunc, int func;)
1065 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1067 thread->depthfunc = command->func;
// queues a DepthFunc command carrying the requested comparison function
1069 void DPSOFTRAST_DepthFunc(int func)
1071 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1072 command->func = func;
// DepthRange command (opcode 12): near and far values
1075 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// stores the pair as depthrange[0] (near) and depthrange[1] (far)
1076 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1078 thread->depthrange[0] = command->nearval;
1079 thread->depthrange[1] = command->farval;
// queues a DepthRange command
1081 void DPSOFTRAST_DepthRange(float nearval, float farval)
1083 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1084 command->nearval = nearval;
1085 command->farval = farval;
// Command 13, PolygonOffset, with two float payload fields: alongnormal and intoview.
1088 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Worker-side handler: stores alongnormal in polygonoffset[0] and intoview in
// polygonoffset[1] of the thread state.
1089 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1091 thread->polygonoffset[0] = command->alongnormal;
1092 thread->polygonoffset[1] = command->intoview;
// Public entry point: obtains a PolygonOffset command record and stores the two
// requested offset values in it.
1094 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1096 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1097 command->alongnormal = alongnormal;
1098 command->intoview = intoview;
// Command 14, CullFace, with one int payload field `mode`.
1101 DEFCOMMAND(14, CullFace, int mode;)
// Worker-side handler: copies the cull mode into the thread state.
1102 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1104 thread->cullface = command->mode;
// Public entry point: obtains a CullFace command record and stores the
// requested cull mode in it.
1106 void DPSOFTRAST_CullFace(int mode)
1108 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1109 command->mode = mode;
// Command 15, AlphaTest, with one int payload field `enable`.
1112 DEFCOMMAND(15, AlphaTest, int enable;)
// Worker-side handler: copies the alpha-test flag into the thread state.
1113 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1115 thread->alphatest = command->enable;
// Public entry point: obtains an AlphaTest command record and stores the
// requested enable flag in it.
1117 void DPSOFTRAST_AlphaTest(int enable)
1119 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1120 command->enable = enable;
// Command 16, AlphaFunc, with an int comparison function and a float reference value.
1123 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Worker-side handler: stores the comparison function in alphafunc and the
// reference value in alphavalue of the thread state.
1124 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1126 thread->alphafunc = command->func;
1127 thread->alphavalue = command->ref;
// Public entry point: obtains an AlphaFunc command record and stores the
// requested comparison function in it.
// NOTE(review): no store of `ref` into the command is visible in this listing;
// the interpreter reads command->ref, so confirm against the full file that it is set.
1129 void DPSOFTRAST_AlphaFunc(int func, float ref)
1131 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1132 command->func = func;
// Sets the current constant color (r, g, b, a) directly in the global
// dpsoftrast state; no command record is allocated here.
1136 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1138 dpsoftrast.color[0] = r;
1139 dpsoftrast.color[1] = g;
1140 dpsoftrast.color[2] = b;
1141 dpsoftrast.color[3] = a;
// Reads a rectangle of 4-byte pixels out of color buffer 0 into outpixels.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, addressed with a row stride of blockwidth*4 bytes
// The rectangle is clamped to the framebuffer. Source rows are addressed as
// (fb_height - 1 - y), i.e. the framebuffer is read vertically flipped.
1144 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1146 int outstride = blockwidth * 4;
1147 int instride = dpsoftrast.fb_width * 4;
1150 int bx2 = blockx + blockwidth;
1151 int by2 = blocky + blockheight;
1156 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer bounds
1160 if (bx1 < 0) bx1 = 0;
1161 if (by1 < 0) by1 = 0;
1162 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1163 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1166 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk each row pixel by pixel (presumably to reorder the
// bytes; the per-pixel body is not visible in this listing)
1167 if (dpsoftrast.bigendian)
1169 for (y = by1;y < by2;y++)
1171 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// NOTE(review): output rows are indexed relative to the clamped by1, not the
// requested blocky, so a rectangle clamped at the top/left shifts within outpixels - confirm this is intended
1172 o = (unsigned char *)outpixels + (y - by1) * outstride;
1173 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing (the row copy itself is not visible in this listing)
1186 for (y = by1;y < by2;y++)
1188 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 into one mip level of a texture.
//   index          - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip            - destination mip level; must be in [0, texture->mipmaps)
//   tx, ty         - destination origin inside the mip level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - size of the rectangle
// Both rectangles are clamped to their surfaces and the copy uses the smaller of
// the two clamped sizes. Returns without copying for an unknown handle or an
// out-of-range mip level.
1195 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1199 int tx2 = tx + width;
1200 int ty2 = ty + height;
1203 int sx2 = sx + width;
1204 int sy2 = sy + height;
1214 unsigned int *spixels;
1215 unsigned int *tpixels;
1216 DPSOFTRAST_Texture *texture;
1217 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1218 if (mip < 0 || mip >= texture->mipmaps) return;
1221 spixels = dpsoftrast.fb_colorpixels[0];
1222 swidth = dpsoftrast.fb_width;
1223 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes; [2] and [3] as the level's width and height
1224 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1225 twidth = texture->mipmap[mip][2];
1226 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1227 if (tx1 < 0) tx1 = 0;
1228 if (ty1 < 0) ty1 = 0;
1229 if (tx2 > twidth) tx2 = twidth;
1230 if (ty2 > theight) ty2 = theight;
1231 if (sx1 < 0) sx1 = 0;
1232 if (sy1 < 0) sy1 = 0;
1233 if (sx2 > swidth) sx2 = swidth;
1234 if (sy2 > sheight) sy2 = sheight;
// copy only the area both rectangles can supply
1239 if (tw > sw) tw = sw;
1240 if (th > sh) th = sh;
1241 if (tw < 1 || th < 1)
// one memcpy per row, tw pixels of 4 bytes each
1243 for (y = 0;y < th;y++)
1244 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
// rebuild the mip chain when the texture has more than one level
1245 if (texture->mipmaps > 1)
1246 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17, SetTexture, carrying a texture unit number and a texture pointer.
1249 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Worker-side handler: drops this thread's bind count on the texture currently
// bound to the unit (if there is one), then binds the command's texture, which
// may be NULL.
1250 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1252 if (thread->texbound[command->unitnum])
1253 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1254 thread->texbound[command->unitnum] = command->texture;
// Binds the texture with handle `index` to texture unit `unitnum`.
//   unitnum - texture unit; must be in [0, DPSOFTRAST_MAXTEXTUREUNITS)
//   index   - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex;
//             a handle of 0 is accepted even when it resolves to no texture
// On an invalid unit or an unknown non-zero handle, dpsoftrast.errorstring is set.
// Otherwise a SetTexture command is filled in, the global binding is updated, and
// the texture's bind count is raised by dpsoftrast.numthreads, matching the single
// decrement each thread performs in DPSOFTRAST_Interpret_SetTexture.
1256 void DPSOFTRAST_SetTexture(int unitnum, int index)
1258 DPSOFTRAST_Command_SetTexture *command;
1259 DPSOFTRAST_Texture *texture;
1260 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1262 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1265 texture = DPSOFTRAST_Texture_GetByIndex(index);
// only a non-zero handle that fails to resolve is an error; handle 0 may leave texture NULL
1266 if (index && !texture)
1268 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1272 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1273 command->unitnum = unitnum;
1274 command->texture = texture;
1276 dpsoftrast.texbound[unitnum] = texture;
// unbinding (NULL texture) has no bind count to raise; dereferencing it here would crash
1277 if (texture) ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the caller's vertex position array and its stride in the global
// state. The pointer is stored, not copied. DPSOFTRAST_Array_Load multiplies the
// stride by a vertex index and adds it to a byte pointer, so it is in bytes.
1280 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1282 dpsoftrast.pointer_vertex3f = vertex3f;
1283 dpsoftrast.stride_vertex = stride;
// Selects a float color array as the per-vertex color source and clears the
// byte-color pointer, so only one of the two color sources is set at a time.
1285 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1287 dpsoftrast.pointer_color4f = color4f;
1288 dpsoftrast.pointer_color4ub = NULL;
1289 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array as the per-vertex color source and
// clears the float-color pointer, so only one of the two color sources is set at a time.
1291 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1293 dpsoftrast.pointer_color4f = NULL;
1294 dpsoftrast.pointer_color4ub = color4ub;
1295 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: the pointer, the number of
// float components per vertex, and the stride.
// NOTE(review): unitnum is used as an array index without a range check here,
// unlike DPSOFTRAST_SetTexture - confirm callers always pass a valid unit.
1297 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1299 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1300 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1301 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18, SetShader, carrying a shader mode and a permutation value.
1304 DEFCOMMAND(18, SetShader, int mode; int permutation;)
// Worker-side handler: copies the shader mode and permutation into the thread state.
1305 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1307 thread->shader_mode = command->mode;
1308 thread->shader_permutation = command->permutation;
// Public entry point: fills in a SetShader command for the worker threads and
// also mirrors the mode and permutation into the global dpsoftrast state.
1310 void DPSOFTRAST_SetShader(int mode, int permutation)
1312 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1313 command->mode = mode;
1314 command->permutation = permutation;
1316 dpsoftrast.shader_mode = mode;
1317 dpsoftrast.shader_permutation = permutation;
// Command 19, Uniform4f, carrying a uniform index and four floats.
1320 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Worker-side handler: copies the four floats into the thread's uniform4f
// array; each uniform index owns four consecutive floats, hence index*4.
1321 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1323 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from four scalars: fills in a Uniform4f
// command for the worker threads and mirrors the values into the global
// uniform4f array at index*4 .. index*4+3.
1325 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1327 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1328 command->index = index;
1329 command->val[0] = v0;
1330 command->val[1] = v1;
1331 command->val[2] = v2;
1332 command->val[3] = v3;
1334 dpsoftrast.uniform4f[index*4+0] = v0;
1335 dpsoftrast.uniform4f[index*4+1] = v1;
1336 dpsoftrast.uniform4f[index*4+2] = v2;
1337 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: reads four floats from v, fills in a
// Uniform4f command and mirrors the values into the global uniform4f array.
1339 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1341 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1342 command->index = index;
1343 memcpy(command->val, v, sizeof(command->val));
1345 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20, UniformMatrix4f, carrying a uniform index and 16 floats; the
// payload is declared through ALIGN so aligned SSE stores can write it.
1348 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Worker-side handler: copies the 16 floats into the thread's uniform4f array
// starting at index*4, so a matrix spans four consecutive uniform slots.
1349 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1351 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets `arraysize` consecutive 4x4 float matrix uniforms starting at `uniform`.
//   uniform   - first uniform index; each matrix advances the index by 4
//   arraysize - number of matrices to read from v
//   transpose - presumably selects the transposing path below (the test itself
//               is not visible in this listing)
//   v         - source floats, 16 per matrix; may be unaligned
// One UniformMatrix4f command is filled in per matrix, and the same values are
// mirrored into the global uniform4f array.
1353 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1357 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1359 __m128 m0, m1, m2, m3;
1360 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1361 command->index = index;
// misaligned source: use unaligned loads
1362 if (((size_t)v)&(ALIGN_SIZE-1))
1364 m0 = _mm_loadu_ps(v);
1365 m1 = _mm_loadu_ps(v+4);
1366 m2 = _mm_loadu_ps(v+8);
1367 m3 = _mm_loadu_ps(v+12);
// aligned source: use aligned loads
1371 m0 = _mm_load_ps(v);
1372 m1 = _mm_load_ps(v+4);
1373 m2 = _mm_load_ps(v+8);
1374 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low and high halves
1378 __m128 t0, t1, t2, t3;
1379 t0 = _mm_unpacklo_ps(m0, m1);
1380 t1 = _mm_unpacklo_ps(m2, m3);
1381 t2 = _mm_unpackhi_ps(m0, m1);
1382 t3 = _mm_unpackhi_ps(m2, m3);
1383 m0 = _mm_movelh_ps(t0, t1);
1384 m1 = _mm_movehl_ps(t1, t0);
1385 m2 = _mm_movelh_ps(t2, t3);
1386 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload and into the global uniform mirror
1388 _mm_store_ps(command->val, m0);
1389 _mm_store_ps(command->val+4, m1);
1390 _mm_store_ps(command->val+8, m2);
1391 _mm_store_ps(command->val+12, m3);
1392 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1393 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1394 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1395 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21, Uniform1i, carrying a uniform index and one int value.
1400 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Worker-side handler: stores the int value in the thread's uniform1i array.
1401 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1403 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: fills in a Uniform1i command for the worker threads
// and mirrors the value into the global uniform1i array.
// NOTE(review): no store of i0 into command->val is visible in this listing;
// the interpreter reads command->val, so confirm against the full file that it is set.
1405 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1407 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1408 command->index = index;
1411 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided byte source into a packed float4 array.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements
// Unaligned loads are used when either the source pointer or the stride is not
// a multiple of ALIGN_SIZE; otherwise aligned loads are used.
1415 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1417 float *end = dst + size*4;
1418 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1422 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1431 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into packed
// float4 elements, setting the fourth component of each to 1.0.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements, in bytes
// Tightly packed input (stride == sizeof(float[3])) is converted four elements
// at a time: three 16-byte loads hold exactly four xyz triples. Other elements
// are converted one at a time.
// Recurring idiom: shuffle(2,1,0,3) rotates the fourth lane into lane 0,
// _mm_move_ss overwrites lane 0 with 1.0, and shuffle(0,3,2,1) rotates back,
// yielding (x, y, z, 1).
// NOTE(review): the one-at-a-time paths load 16 bytes per 3-float element, so
// the last element reads 4 bytes past its own data - confirm the source arrays
// are padded or that this is otherwise safe.
1438 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1440 float *end = dst + size*4;
1441 if (stride == sizeof(float[3]))
// end of the portion that can be processed in groups of four elements
1443 float *end4 = dst + (size&~3)*4;
1444 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1448 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1449 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1450 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1451 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1452 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1453 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1454 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1455 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1456 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1457 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1460 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462 src += 4*sizeof(float[3]);
// aligned packed source: same shuffles with aligned loads
1469 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1470 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1471 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1472 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1474 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1477 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1478 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1481 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 src += 4*sizeof(float[3]);
// one element at a time: unaligned loads if the pointer or stride is misaligned
1487 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1491 __m128 v = _mm_loadu_ps((const float *)src);
1492 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1493 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1494 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1495 _mm_store_ps(dst, v);
// one element at a time, aligned loads
1504 __m128 v = _mm_load_ps((const float *)src);
1505 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1506 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1507 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1508 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into packed
// float4 elements of the form (a, b, 0, 1).
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements, in bytes
// Tightly packed input (stride == sizeof(float[2])) is converted two elements
// per 16-byte load; other elements are loaded one at a time.
1515 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1517 float *end = dst + size*4;
// template supplying the (0, 1) upper half of every output
1518 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1519 if (stride == sizeof(float[2]))
// end of the portion that can be processed in pairs
1521 float *end2 = dst + (size&~1)*4;
1522 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v holds two pairs: the low pair goes to dst, the high pair to dst + 4
1526 __m128 v = _mm_loadu_ps((const float *)src);
1527 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1528 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1530 src += 2*sizeof(float[2]);
// same as above with an aligned load
1537 __m128 v = _mm_load_ps((const float *)src);
1538 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1539 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1541 src += 2*sizeof(float[2]);
// single element: load two floats into the low half of the (0,0,0,1) template
1547 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided byte source into packed
// float4 elements, scaling each byte by 1/255.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - distance between source elements, in bytes
// Tightly packed input (stride == 4) is converted four elements per 16-byte
// load; other elements are loaded one 32-bit word at a time.
1553 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1555 float *end = dst + size*4;
1556 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1557 if (stride == sizeof(unsigned char[4]))
// end of the portion that can be processed in groups of four elements
1559 float *end4 = dst + (size&~3)*4;
1560 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend bytes to 16 bits (v1 = elements 0-1, v2 = elements 2-3), then to
// 32 bits, convert to float and scale
1564 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1565 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1566 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1567 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1568 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1570 src += 4*sizeof(unsigned char[4]);
// same conversion with an aligned load
1577 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1578 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1579 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1580 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1581 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1583 src += 4*sizeof(unsigned char[4]);
// single element: read its four bytes as one 32-bit word and expand as above
1589 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1590 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills `size` float4 elements of dst with the single 4-float value at src.
// src is read once with an unaligned load; dst is written with aligned stores
// and so must be 16-byte aligned.
1596 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1598 float *end = dst + 4*size;
1599 __m128 v = _mm_loadu_ps(src);
1602 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vectors by a 4x4 matrix.
//   out4f       - destination array
//   in4f        - source array; may be the same pointer as out4f
//   numitems    - number of float4 vectors
//   inmatrix16f - 16 floats, loaded as four 4-float groups m0..m3; may be unaligned
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where (x, y, z, w) is the input vector.
1608 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1611 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1612 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1614 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1616 // fast case for identity matrix
1617 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1620 end = out4f + numitems*4;
1621 m0 = _mm_loadu_ps(inmatrix16f);
1622 m1 = _mm_loadu_ps(inmatrix16f + 4);
1623 m2 = _mm_loadu_ps(inmatrix16f + 8);
1624 m3 = _mm_loadu_ps(inmatrix16f + 12);
1625 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: broadcast each component and accumulate against m0..m3
1629 __m128 v = _mm_loadu_ps(in4f);
1631 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1632 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1633 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1634 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input: same arithmetic with an aligned load
1643 __m128 v = _mm_load_ps(in4f);
1645 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1646 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1647 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1648 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1656 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1658 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-projects the clip-space vector `in` (x, y, z, w) into `out`.
// The vector is rotated to (1, x, y, z), multiplied lane-wise by viewportscale,
// divided by w, offset by viewportcenter, and rotated back. The results for
// x, y and z land in lanes 0..2, and lane 3 holds
// viewportcenter[0] + viewportscale[0] / w.
// viewportcenter and viewportscale must therefore be laid out in the rotated
// lane order (lane 0 pairs with the constant 1, lanes 1..3 with x, y, z).
1662 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1664 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1665 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1666 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1667 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro; the lines visible in this listing are identical to those
// of DPSOFTRAST_PROJECTVERTEX: rotate to (1, x, y, z), scale, divide by w,
// add the viewport center, rotate back.
1670 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1672 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1673 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1674 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1675 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a float4 vector by the matrix given as four __m128 groups:
// out = x*m0 + y*m1 + z*m2 + w*m3, where (x, y, z, w) are the lanes of the
// local `p` (presumably a copy of `in`; its declaration is not visible in this listing).
1678 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1681 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1682 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1683 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1684 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a conservative screen-space row range for an axis-aligned bounding box.
//   starty, endy            - receive the resulting row range
//   minpos, maxpos          - box corners before transformation
//   viewportcenter/scale    - viewport mapping in the lane layout used by DPSOFTRAST_PROJECTVERTEX
//   m0..m3                  - the four 4-float groups of the transform matrix
// The eight box corners are transformed. Corners with z + w >= 0 are projected
// directly. For each corner that fails that test, the box edges leading to a
// passing neighbor (k^1, k^2, k^4) are intersected with the z + w = 0 plane and
// the intersection is projected instead. The min/max of the projected values
// (clamped to [-2, 2]) is mapped through the viewport.
// clipmask has bit k set while corner k fails the z + w >= 0 test; presumably
// it is the return value (the return statement is not visible in this listing).
1687 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1689 int clipmask = 0xFF;
1690 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix group so the transformed y ends up in
// lane 0, where the scalar (_ss) operations below work
1691 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1692 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1693 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1694 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// non-negative clear the corner's clipmask bit and fold its lane-0 value
// divided by w into minproj/maxproj
1695 #define BBFRONT(k, pos) \
1697 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1698 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1699 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1702 clipmask &= ~(1<<k); \
1703 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1704 minproj = _mm_min_ss(minproj, proj); \
1705 maxproj = _mm_max_ss(maxproj, proj); \
1709 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1710 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1711 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1712 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1713 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1714 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1718 if (clipmask&(1<<k)) \
1720 if (!(clipmask&(1<<(k^1)))) \
1722 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1723 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1724 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1725 minproj = _mm_min_ss(minproj, proj); \
1726 maxproj = _mm_max_ss(maxproj, proj); \
1728 if (!(clipmask&(1<<(k^2)))) \
1730 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1731 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1732 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1733 minproj = _mm_min_ss(minproj, proj); \
1734 maxproj = _mm_max_ss(maxproj, proj); \
1736 if (!(clipmask&(1<<(k^4)))) \
1738 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1739 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1740 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1741 minproj = _mm_min_ss(minproj, proj); \
1742 maxproj = _mm_max_ss(maxproj, proj); \
1746 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the y slot in the rotated layout) into lane 0
1747 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1748 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected extent to [-2, 2] before mapping it through the viewport
1749 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1750 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1751 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1752 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from the mapped maximum and endy from the mapped minimum
// (presumably because the y viewport scale is negative - TODO confirm)
1753 *starty = _mm_cvttss_si32(maxproj);
1754 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed float4 vertices to screen space.
//   out4f     - receives an unmodified copy of each input vertex
//   screen4f  - receives each vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f      - source vertices; may be unaligned
// The component-wise min/max of the inputs forms a bounding box, which is
// passed to DPSOFTRAST_Vertex_BoundY with identity matrix rows. Returns that
// function's result.
// NOTE(review): the first vertex is loaded to seed min/max before any loop, so
// numitems == 0 would read in4f[0..3] - confirm callers never pass an empty range.
1758 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1760 float *end = out4f + numitems*4;
1761 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1762 __m128 minpos, maxpos;
1763 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1765 minpos = maxpos = _mm_loadu_ps(in4f);
1768 __m128 v = _mm_loadu_ps(in4f);
1769 minpos = _mm_min_ps(minpos, v);
1770 maxpos = _mm_max_ps(maxpos, v);
1771 _mm_store_ps(out4f, v);
1772 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1773 _mm_store_ps(screen4f, v);
// aligned input path: same work with aligned loads
1781 minpos = maxpos = _mm_load_ps(in4f);
1784 __m128 v = _mm_load_ps(in4f);
1785 minpos = _mm_min_ps(minpos, v);
1786 maxpos = _mm_max_ps(maxpos, v);
1787 _mm_store_ps(out4f, v);
1788 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1789 _mm_store_ps(screen4f, v);
// the vertices are already transformed, so bound the box with an identity matrix
1796 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1797 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1798 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1799 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1800 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms `numitems` float4 vertices by a 4x4 matrix and projects them to screen space.
//   out4f       - receives each transformed vertex
//   screen4f    - receives each transformed vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f        - source vertices; may be unaligned
//   inmatrix16f - 16 floats loaded as four 4-float groups; may be unaligned
// An identity matrix (bytewise comparison) delegates to DPSOFTRAST_Vertex_Project.
// The bounding box is accumulated from the untransformed inputs and the matrix
// is handed to DPSOFTRAST_Vertex_BoundY, which transforms the box corners itself.
// Returns that function's result.
1804 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1806 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1807 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1809 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1810 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1811 end = out4f + numitems*4;
1812 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1813 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1814 m0 = _mm_loadu_ps(inmatrix16f);
1815 m1 = _mm_loadu_ps(inmatrix16f + 4);
1816 m2 = _mm_loadu_ps(inmatrix16f + 8);
1817 m3 = _mm_loadu_ps(inmatrix16f + 12);
1818 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; min/max are taken before the transform
1820 minpos = maxpos = _mm_loadu_ps(in4f);
1823 __m128 v = _mm_loadu_ps(in4f);
1824 minpos = _mm_min_ps(minpos, v);
1825 maxpos = _mm_max_ps(maxpos, v);
1826 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1827 _mm_store_ps(out4f, v);
1828 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1829 _mm_store_ps(screen4f, v);
// aligned input path: same work with aligned loads
1837 minpos = maxpos = _mm_load_ps(in4f);
1840 __m128 v = _mm_load_ps(in4f);
1841 minpos = _mm_min_ps(minpos, v);
1842 maxpos = _mm_max_ps(maxpos, v);
1843 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1844 _mm_store_ps(out4f, v);
1845 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1846 _mm_store_ps(screen4f, v);
1853 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Expands one of the caller-supplied vertex arrays into the packed float4
// working array post_array4f[outarray], for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices).
//   outarray - index of the destination working array
//   inarray  - DPSOFTRAST_ARRAY_* selector of the source
// Positions are expanded from 3 floats. Colors come from the float array, the
// byte array, or the constant dpsoftrast.color when neither pointer is set.
// Texture coordinates are expanded according to their component count.
1858 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1861 float *outf = dpsoftrast.post_array4f[outarray];
1862 const unsigned char *inb;
1863 int firstvertex = dpsoftrast.firstvertex;
1864 int numvertices = dpsoftrast.numvertices;
1868 case DPSOFTRAST_ARRAY_POSITION:
1869 stride = dpsoftrast.stride_vertex;
1870 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1871 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1873 case DPSOFTRAST_ARRAY_COLOR:
1874 stride = dpsoftrast.stride_color;
1875 if (dpsoftrast.pointer_color4f)
1877 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1878 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1880 else if (dpsoftrast.pointer_color4ub)
// stride already holds stride_color from the top of this case; this repeats the assignment
1882 stride = dpsoftrast.stride_color;
1883 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1884 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no per-vertex color array: replicate the current constant color
1888 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texture coordinate arrays: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0
1892 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1893 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1895 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// pick the loader matching the stored component count
1896 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1899 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1902 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1905 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by a 4x4 matrix. When inarray >= 0 the
// array is first loaded with DPSOFTRAST_Array_Load; a negative inarray means
// post_array4f[outarray] is used as it already stands.
1917 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1919 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1920 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without transforming it. When
// inarray >= 0 the array is first loaded with DPSOFTRAST_Array_Load. Screen
// coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and DPSOFTRAST_Vertex_Project's result to drawclipped.
1925 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1928 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1929 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by a 4x4 matrix and projects it to screen
// space. When inarray >= 0 the array is first loaded with DPSOFTRAST_Array_Load.
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and DPSOFTRAST_Vertex_TransformProject's result to drawclipped.
1937 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1940 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1941 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Computes the per-pixel reciprocal of the interpolated w along one span.
//   triangle - supplies the w plane: w[0] is the slope along x, w[1] the slope
//              along y, w[2] the constant term
//   span     - supplies x, y and the [startx, endx) pixel range
//   zf       - output array (presumably receives z for each pixel; the store is
//              not visible in this listing)
// The exact reciprocal is evaluated only at sub-span boundaries, at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart, and stepped linearly in between.
1948 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1951 int startx = span->startx;
1952 int endx = span->endx;
1953 float wslope = triangle->w[0];
1954 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1955 float endz = 1.0f / (w + wslope * startx);
1956 for (x = startx;x < endx;)
1958 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the final sub-span ends on the last pixel of the span
1960 if (nextsub >= endx) nextsub = endsub = endx-1;
1961 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel, avoiding a division by zero
1962 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1963 for (; x <= endsub; x++, z += dz)
// Writes one span of float RGBA colors to color buffer 0, applying the thread's blend mode.
//   thread   - supplies the alphatest flag and fb_blendmode
//   triangle - not referenced in the lines visible in this listing
//   span     - supplies x, y, the [startx, endx) range and the per-pixel mask
//   in4f     - source colors, four floats per pixel in R, G, B, A order
// The framebuffer bytes are written in B, G, R, A order, so source component 2
// pairs with pixel byte 0 and source component 0 with pixel byte 2.
// Results are clamped to 255 from above; only SUBALPHA also clamps at 0.
// NOTE(review): a negative result in the other modes would wrap when narrowed
// to unsigned char - confirm the inputs are non-negative.
1968 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1971 int startx = span->startx;
1972 int endx = span->endx;
1975 unsigned char * RESTRICT pixelmask = span->pixelmask;
1976 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// move to the span's origin; x indices below are relative to span->x
1979 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1980 // handle alphatest now (this affects depth writes too)
// the threshold is a fixed 0.5 in these lines; thread->alphafunc and
// thread->alphavalue are not consulted here
1981 if (thread->alphatest)
1982 for (x = startx;x < endx;x++)
1983 if (in4f[x*4+3] < 0.5f)
1984 pixelmask[x] = false;
1985 // FIXME: this does not handle bigendian
1986 switch(thread->fb_blendmode)
// OPAQUE: dst = src
1988 case DPSOFTRAST_BLENDMODE_OPAQUE:
1989 for (x = startx;x < endx;x++)
1993 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1994 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1995 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1996 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1997 pixel[x*4+0] = d[0];
1998 pixel[x*4+1] = d[1];
1999 pixel[x*4+2] = d[2];
2000 pixel[x*4+3] = d[3];
// ALPHA: dst = src * alpha + dst * (1 - alpha)
2003 case DPSOFTRAST_BLENDMODE_ALPHA:
2004 for (x = startx;x < endx;x++)
2008 a = in4f[x*4+3] * 255.0f;
2009 b = 1.0f - in4f[x*4+3];
2010 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2011 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2012 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2013 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2014 pixel[x*4+0] = d[0];
2015 pixel[x*4+1] = d[1];
2016 pixel[x*4+2] = d[2];
2017 pixel[x*4+3] = d[3];
// ADDALPHA: dst = src * alpha + dst
2020 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2021 for (x = startx;x < endx;x++)
2025 a = in4f[x*4+3] * 255.0f;
2026 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030 pixel[x*4+0] = d[0];
2031 pixel[x*4+1] = d[1];
2032 pixel[x*4+2] = d[2];
2033 pixel[x*4+3] = d[3];
// ADD: dst = src + dst
2036 case DPSOFTRAST_BLENDMODE_ADD:
2037 for (x = startx;x < endx;x++)
2041 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045 pixel[x*4+0] = d[0];
2046 pixel[x*4+1] = d[1];
2047 pixel[x*4+2] = d[2];
2048 pixel[x*4+3] = d[3];
// INVMOD: dst = (1 - src) * dst
2051 case DPSOFTRAST_BLENDMODE_INVMOD:
2052 for (x = startx;x < endx;x++)
2056 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060 pixel[x*4+0] = d[0];
2061 pixel[x*4+1] = d[1];
2062 pixel[x*4+2] = d[2];
2063 pixel[x*4+3] = d[3];
// MUL: dst = src * dst
2066 case DPSOFTRAST_BLENDMODE_MUL:
2067 for (x = startx;x < endx;x++)
2071 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2072 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2073 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2074 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2075 pixel[x*4+0] = d[0];
2076 pixel[x*4+1] = d[1];
2077 pixel[x*4+2] = d[2];
2078 pixel[x*4+3] = d[3];
// MUL2: dst = src * dst * 2
2081 case DPSOFTRAST_BLENDMODE_MUL2:
2082 for (x = startx;x < endx;x++)
2086 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2087 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2088 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2089 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2090 pixel[x*4+0] = d[0];
2091 pixel[x*4+1] = d[1];
2092 pixel[x*4+2] = d[2];
2093 pixel[x*4+3] = d[3];
// SUBALPHA: dst = dst - src * alpha, clamped at both ends
2096 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2097 for (x = startx;x < endx;x++)
2101 a = in4f[x*4+3] * -255.0f;
2102 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2103 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2104 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2105 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2106 pixel[x*4+0] = d[0];
2107 pixel[x*4+1] = d[1];
2108 pixel[x*4+2] = d[2];
2109 pixel[x*4+3] = d[3];
// PSEUDOALPHA: dst = src * a + dst * (1 - alpha); the assignment of `a` is not
// visible in this listing
2112 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2113 for (x = startx;x < endx;x++)
2118 b = 1.0f - in4f[x*4+3];
2119 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2120 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2121 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2122 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2123 pixel[x*4+0] = d[0];
2124 pixel[x*4+1] = d[1];
2125 pixel[x*4+2] = d[2];
2126 pixel[x*4+3] = d[3];
// INVADD: dst = (255 - dst) * src + dst
// d[0] is stored to pixel byte 0 (blue) and d[2] to pixel byte 2 (red), so each
// must read its own channel: source component 2 with byte 0, component 0 with
// byte 2, as in every other mode above
2129 case DPSOFTRAST_BLENDMODE_INVADD:
2130 for (x = startx;x < endx;x++)
2134 d[0] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2135 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136 d[2] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2137 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138 pixel[x*4+0] = d[0];
2139 pixel[x*4+1] = d[1];
2140 pixel[x*4+2] = d[2];
2141 pixel[x*4+3] = d[3];
// Blends one span of 8-bit-per-channel source pixels (in4ub, 4 bytes per pixel)
// into the color framebuffer dpsoftrast.fb_colorpixels[0] at row span->y,
// starting at column span->x, for x in [span->startx, span->endx).
// Pixels whose span->pixelmask entry is zero are left untouched; the alpha test
// (when thread->alphatest is set) clears further mask entries before blending.
// The blend equation is selected by thread->fb_blendmode.
// NOTE(review): this listing elides some lines (braces, a few declarations and
// statements), so only the visible statements are documented here.
2147 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2151 int startx = span->startx;
2152 int endx = span->endx;
2153 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2154 unsigned char * RESTRICT pixelmask = span->pixelmask;
2155 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2156 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2159 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2160 pixeli += span->y * dpsoftrast.fb_width + span->x;
// in4ub holds alpha as a byte in 0..255, so the half-opacity threshold is 128.
// The previous comparison against 0.5f only rejected pixels whose alpha was exactly 0.
2161 // handle alphatest now (this affects depth writes too)
2162 if (thread->alphatest)
2163 for (x = startx;x < endx;x++)
2164 if (in4ub[x*4+3] < 128)
2165 pixelmask[x] = false;
2166 // FIXME: this does not handle bigendian
2167 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, copy four pixels with one unaligned 128-bit store.
2169 case DPSOFTRAST_BLENDMODE_OPAQUE:
2170 for (x = startx;x + 4 <= endx;)
2172 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2174 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1) walks the span two pixels at a time, dispatching on the
// two mask bytes read as one 16-bit value: blend2 is applied to a widened pixel pair,
// blend1 to a single widened pixel. Channels are widened to 16 bits before blending and
// packed back with unsigned saturation (_mm_packus_epi16). A trailing odd pixel is
// handled by the final loop.
// ALPHA: dst += (src - dst) * srcalpha / 256; the alpha word is broadcast from word 3 of each pixel.
2188 case DPSOFTRAST_BLENDMODE_ALPHA:
2189 #define FINISHBLEND(blend2, blend1) \
2190 for (x = startx;x + 1 < endx;x += 2) \
2193 switch (*(const unsigned short*)&pixelmask[x]) \
2196 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2197 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2199 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2202 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2203 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2205 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2208 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2209 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2211 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2216 for(;x < endx; x++) \
2219 if (!pixelmask[x]) \
2221 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2222 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2224 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2228 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2231 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2232 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8
2235 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2237 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2240 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (saturated when packed back to bytes)
2244 case DPSOFTRAST_BLENDMODE_ADD:
2245 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2247 case DPSOFTRAST_BLENDMODE_INVMOD:
2249 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2254 case DPSOFTRAST_BLENDMODE_MUL:
2255 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2257 case DPSOFTRAST_BLENDMODE_MUL2:
2258 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8
2260 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2262 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2263 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2265 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2266 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2269 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2271 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2274 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) / 256
2278 case DPSOFTRAST_BLENDMODE_INVADD:
2280 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to thread->texbound[texunitindex] across one span and
// writes float RGBA (texture bytes are read as B,G,R,A and swizzled) to
// out4f[x*4 .. x*4+3] for x in [span->startx, span->endx).
// Texture coordinates come from vertex attribute `arrayindex`, interpolated linearly
// in x as (data + slope*x) and multiplied by zf[x] at subspan endpoints; between
// endpoints they are stepped linearly in 16.16 fixed point.
// Mip level is triangle->mip[texunitindex]. Filtering is bilinear when
// texture->filter has DPSOFTRAST_TEXTURE_FILTER_LINEAR, otherwise nearest; addressing
// clamps when DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE is set, otherwise wraps with a mask.
// NOTE(review): this listing elides some lines (braces, several declarations, the
// `if (filter)` test and the tcimin setup), so only visible statements are documented.
2289 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2292 int startx = span->startx;
2293 int endx = span->endx;
2298 float tc[2], endtc[2];
2300 unsigned int tci[2];
2301 unsigned int tci1[2];
2302 unsigned int tcimin[2];
2303 unsigned int tcimax[2];
2308 const unsigned char * RESTRICT pixelbase;
2309 const unsigned char * RESTRICT pixel[4];
2310 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2311 // if no texture is bound, just fill it with white
2314 for (x = startx;x < endx;x++)
2316 out4f[x*4+0] = 1.0f;
2317 out4f[x*4+1] = 1.0f;
2318 out4f[x*4+2] = 1.0f;
2319 out4f[x*4+3] = 1.0f;
2323 mip = triangle->mip[texunitindex];
2324 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2325 // if this mipmap of the texture is 1 pixel, just fill it with that color
2326 if (texture->mipmap[mip][1] == 4)
2328 c[0] = texture->bytes[2] * (1.0f/255.0f);
2329 c[1] = texture->bytes[1] * (1.0f/255.0f);
2330 c[2] = texture->bytes[0] * (1.0f/255.0f);
2331 c[3] = texture->bytes[3] * (1.0f/255.0f);
2332 for (x = startx;x < endx;x++)
2334 out4f[x*4+0] = c[0];
2335 out4f[x*4+1] = c[1];
2336 out4f[x*4+2] = c[2];
2337 out4f[x*4+3] = c[3];
2341 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2342 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2343 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2344 tcscale[0] = texture->mipmap[mip][2];
2345 tcscale[1] = texture->mipmap[mip][3];
2346 tciwidth = texture->mipmap[mip][2];
2349 tcimax[0] = texture->mipmap[mip][2]-1;
2350 tcimax[1] = texture->mipmap[mip][3]-1;
2351 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2352 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts to texel centers
2353 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2354 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// process the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2355 for (x = startx;x < endx;)
2357 unsigned int subtc[2];
2358 unsigned int substep[2];
2359 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2360 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2361 if (nextsub >= endx)
2363 nextsub = endsub = endx-1;
2364 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2368 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2369 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// 16.16 fixed-point start coordinate and per-pixel step for this subspan
2370 substep[0] = (endtc[0] - tc[0]) * subscale;
2371 substep[1] = (endtc[1] - tc[1]) * subscale;
2372 subtc[0] = tc[0] * (1<<16);
2373 subtc[1] = tc[1] * (1<<16);
// bilinear, clamp-to-edge addressing
2376 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2378 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// subtc is 16.16 fixed point (the integer texel is subtc>>16), so the 12-bit blend
// weight must be the TOP 12 bits of the 16-bit fraction; masking the low 12 bits
// directly made the weight cycle 16 times per texel.
2380 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2381 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// four 24-bit corner weights that sum to 0x1000000; times a 0..255 texel this
// peaks at 0xFF000000, which is the normalization constant used below
2382 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2383 tci[0] = subtc[0]>>16;
2384 tci[1] = subtc[1]>>16;
2385 tci1[0] = tci[0] + 1;
2386 tci1[1] = tci[1] + 1;
2387 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2388 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2389 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2390 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2391 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2392 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2393 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2394 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2395 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2396 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2397 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2398 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2399 out4f[x*4+0] = c[0];
2400 out4f[x*4+1] = c[1];
2401 out4f[x*4+2] = c[2];
2402 out4f[x*4+3] = c[3];
// bilinear, wrapped addressing (coordinates ANDed with size-1)
2407 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// same top-12-bits-of-fraction weight as in the clamped loop
2409 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2410 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2411 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2412 tci[0] = subtc[0]>>16;
2413 tci[1] = subtc[1]>>16;
2414 tci1[0] = tci[0] + 1;
2415 tci1[1] = tci[1] + 1;
2416 tci[0] &= tciwrapmask[0];
2417 tci[1] &= tciwrapmask[1];
2418 tci1[0] &= tciwrapmask[0];
2419 tci1[1] &= tciwrapmask[1];
2420 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2422 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2423 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2424 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2425 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2426 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2427 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2428 out4f[x*4+0] = c[0];
2429 out4f[x*4+1] = c[1];
2430 out4f[x*4+2] = c[2];
2431 out4f[x*4+3] = c[3];
// nearest, clamp-to-edge addressing
2435 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2437 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2439 tci[0] = subtc[0]>>16;
2440 tci[1] = subtc[1]>>16;
2441 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2442 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2443 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2444 c[0] = pixel[0][2] * (1.0f / 255.0f);
2445 c[1] = pixel[0][1] * (1.0f / 255.0f);
2446 c[2] = pixel[0][0] * (1.0f / 255.0f);
2447 c[3] = pixel[0][3] * (1.0f / 255.0f);
2448 out4f[x*4+0] = c[0];
2449 out4f[x*4+1] = c[1];
2450 out4f[x*4+2] = c[2];
2451 out4f[x*4+3] = c[3];
// nearest, wrapped addressing
2456 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2458 tci[0] = subtc[0]>>16;
2459 tci[1] = subtc[1]>>16;
2460 tci[0] &= tciwrapmask[0];
2461 tci[1] &= tciwrapmask[1];
2462 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463 c[0] = pixel[0][2] * (1.0f / 255.0f);
2464 c[1] = pixel[0][1] * (1.0f / 255.0f);
2465 c[2] = pixel[0][0] * (1.0f / 255.0f);
2466 c[3] = pixel[0][3] * (1.0f / 255.0f);
2467 out4f[x*4+0] = c[0];
2468 out4f[x*4+1] = c[1];
2469 out4f[x*4+2] = c[2];
2470 out4f[x*4+3] = c[3];
2476 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2480 int startx = span->startx;
2481 int endx = span->endx;
2483 __m128 data, slope, tcscale;
2484 __m128i tcsize, tcmask, tcoffset, tcmax;
2486 __m128i subtc, substep, endsubtc;
2489 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2490 const unsigned char * RESTRICT pixelbase;
2491 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2492 // if no texture is bound, just fill it with white
2495 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2498 mip = triangle->mip[texunitindex];
2499 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2500 // if this mipmap of the texture is 1 pixel, just fill it with that color
2501 if (texture->mipmap[mip][1] == 4)
2503 unsigned int k = *((const unsigned int *)pixelbase);
2504 for (x = startx;x < endx;x++)
2508 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2509 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2510 flags = texture->flags;
2511 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2512 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2513 tcscale = _mm_cvtepi32_ps(tcsize);
2514 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2515 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2516 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2517 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2518 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2519 tcmax = _mm_packs_epi32(tcmask, tcmask);
2520 for (x = startx;x < endx;)
2522 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2523 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2524 if (nextsub >= endx)
2526 nextsub = endsub = endx-1;
2527 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2531 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2532 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2533 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2534 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2535 substep = _mm_slli_epi32(substep, 1);
2538 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2539 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2541 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2542 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2544 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2545 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2546 tci = _mm_madd_epi16(tci, tcoffset);
2547 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2548 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2549 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2550 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2551 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2552 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2553 fracm = _mm_srli_epi16(subtc, 1);
2554 pix1 = _mm_add_epi16(pix1,
2555 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2556 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2557 pix3 = _mm_add_epi16(pix3,
2558 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2559 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2560 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2561 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2562 pix2 = _mm_add_epi16(pix2,
2563 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2564 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2565 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2569 const unsigned char * RESTRICT ptr1;
2570 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2571 tci = _mm_madd_epi16(tci, tcoffset);
2572 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2573 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2574 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2575 fracm = _mm_srli_epi16(subtc, 1);
2576 pix1 = _mm_add_epi16(pix1,
2577 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2578 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2579 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2580 pix1 = _mm_add_epi16(pix1,
2581 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2582 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2583 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2587 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2589 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2591 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2592 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2593 tci = _mm_madd_epi16(tci, tcoffset);
2594 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2595 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2596 _mm_setzero_si128());
2597 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2598 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2599 _mm_setzero_si128());
2600 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2601 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2602 tci = _mm_madd_epi16(tci, tcoffset);
2603 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2604 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2605 _mm_setzero_si128());
2606 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2607 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2608 _mm_setzero_si128());
2609 fracm = _mm_srli_epi16(subtc, 1);
2610 pix1 = _mm_add_epi16(pix1,
2611 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2612 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2613 pix3 = _mm_add_epi16(pix3,
2614 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2615 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2616 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2617 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2618 pix2 = _mm_add_epi16(pix2,
2619 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2620 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2621 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2625 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2626 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2627 tci = _mm_madd_epi16(tci, tcoffset);
2628 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2629 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2630 _mm_setzero_si128());
2631 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2632 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2633 _mm_setzero_si128());
2634 fracm = _mm_srli_epi16(subtc, 1);
2635 pix1 = _mm_add_epi16(pix1,
2636 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2637 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2638 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2639 pix1 = _mm_add_epi16(pix1,
2640 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2641 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2642 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2648 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2650 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2651 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2652 tci = _mm_madd_epi16(tci, tcoffset);
2653 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2655 _mm_setzero_si128());
2656 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2657 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2658 _mm_setzero_si128());
2659 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2660 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2661 tci = _mm_madd_epi16(tci, tcoffset);
2662 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2663 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2664 _mm_setzero_si128());
2665 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2666 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2667 _mm_setzero_si128());
2668 fracm = _mm_srli_epi16(subtc, 1);
2669 pix1 = _mm_add_epi16(pix1,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2672 pix3 = _mm_add_epi16(pix3,
2673 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2674 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2675 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2676 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2677 pix2 = _mm_add_epi16(pix2,
2678 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2679 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2680 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2684 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2685 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2686 tci = _mm_madd_epi16(tci, tcoffset);
2687 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2688 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2689 _mm_setzero_si128());
2690 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2691 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2692 _mm_setzero_si128());
2693 fracm = _mm_srli_epi16(subtc, 1);
2694 pix1 = _mm_add_epi16(pix1,
2695 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2696 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2697 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2698 pix1 = _mm_add_epi16(pix1,
2699 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2701 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2708 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2710 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2712 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2713 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2714 tci = _mm_madd_epi16(tci, tcoffset);
2715 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2716 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2720 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2721 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2722 tci = _mm_madd_epi16(tci, tcoffset);
2723 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2729 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2731 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2732 tci = _mm_and_si128(tci, tcmax);
2733 tci = _mm_madd_epi16(tci, tcoffset);
2734 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2735 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2739 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2740 tci = _mm_and_si128(tci, tcmax);
2741 tci = _mm_madd_epi16(tci, tcoffset);
2742 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map span sampler. The only visible statement fills the span's output pixels
// [span->startx, span->endx) in out4ub with 0xFF bytes (opaque white); texunitindex,
// arrayindex and zf are not used by it.
2751 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// The byte count is (endx - startx) * 4. The operands were previously reversed, giving
// a negative int that memset() receives as an enormous size_t and overruns the buffer.
2754 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a pointer to float data (`vector`) and returns a float.
// NOTE(review): the body is not visible in this listing, so the sampling semantics
// and the meaning of the return value must be confirmed against the implementation.
2757 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For each x in [span->startx, span->endx), multiplies the four floats of in4f at x
// component-wise by the interpolated vertex attribute `arrayindex` and stores the
// result in out4f at x. The attribute is evaluated as (data + slope*x) * z, with data
// and slope produced by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the declarations and the assignment of z are elided in this listing —
// presumably z = zf[x]; confirm.
2763 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2766 int startx = span->startx;
2767 int endx = span->endx;
2772 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2773 for (x = startx;x < endx;x++)
// attribute value at this pixel
2776 c[0] = (data[0] + slope[0]*x) * z;
2777 c[1] = (data[1] + slope[1]*x) * z;
2778 c[2] = (data[2] + slope[2]*x) * z;
2779 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input by the attribute, component by component (no channel swizzle)
2780 out4f[x*4+0] = in4f[x*4+0] * c[0];
2781 out4f[x*4+1] = in4f[x*4+1] * c[1];
2782 out4f[x*4+2] = in4f[x*4+2] * c[2];
2783 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For each x in [span->startx, span->endx), writes the interpolated vertex attribute
// `arrayindex` to the four floats of out4f at x. The attribute is evaluated as
// (data + slope*x) * z, with data and slope produced by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the declarations and the assignment of z are elided in this listing —
// presumably z = zf[x]; confirm.
2787 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2790 int startx = span->startx;
2791 int endx = span->endx;
2796 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2797 for (x = startx;x < endx;x++)
// attribute value at this pixel
2800 c[0] = (data[0] + slope[0]*x) * z;
2801 c[1] = (data[1] + slope[1]*x) * z;
2802 c[2] = (data[2] + slope[2]*x) * z;
2803 c[3] = (data[3] + slope[3]*x) * z;
2804 out4f[x*4+0] = c[0];
2805 out4f[x*4+1] = c[1];
2806 out4f[x*4+2] = c[2];
2807 out4f[x*4+3] = c[3];
// For each x in [span->startx, span->endx) and each of the four components:
//   out4f = ina4f + max(inb4f - subcolor, 0)
// subcolor points to four floats that are subtracted from inb4f before the add;
// negative differences are clamped to zero. `triangle` is not used by the visible code.
2811 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2813 int x, startx = span->startx, endx = span->endx;
2814 float c[4], localcolor[4];
// copy the subtract color into locals once, before the loop
2815 localcolor[0] = subcolor[0];
2816 localcolor[1] = subcolor[1];
2817 localcolor[2] = subcolor[2];
2818 localcolor[3] = subcolor[3];
2819 for (x = startx;x < endx;x++)
// subtract and clamp at zero
2821 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2822 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2823 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2824 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2825 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2826 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2827 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2828 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// For each x in [span->startx, span->endx), stores the component-wise product of the
// four floats of ina4f and inb4f at x into out4f at x.
// `triangle` is not used by the visible code.
2832 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2834 int x, startx = span->startx, endx = span->endx;
2835 for (x = startx;x < endx;x++)
2837 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2838 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2839 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2840 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// For each x in [span->startx, span->endx), stores the component-wise sum of the
// four floats of ina4f and inb4f at x into out4f at x. No clamping is applied.
// `triangle` is not used by the visible code.
2844 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2846 int x, startx = span->startx, endx = span->endx;
2847 for (x = startx;x < endx;x++)
2849 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2850 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2851 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2852 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// For each x in [span->startx, span->endx), blends the two inputs per component:
//   out4f = ina4f * a + inb4f * b, where a = 1 - (fourth component of inb4f at x).
// NOTE(review): the declaration of a and b and the assignment of b are elided in this
// listing — presumably b = inb4f[x*4+3], making this a lerp by inb's alpha; confirm.
// `triangle` is not used by the visible code.
2856 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2858 int x, startx = span->startx, endx = span->endx;
2860 for (x = startx;x < endx;x++)
// weight applied to ina4f
2862 a = 1.0f - inb4f[x*4+3];
2864 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2865 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2866 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2867 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2871 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2873 int x, startx = span->startx, endx = span->endx;
2874 float localcolor[4], ilerp, lerp;
2875 localcolor[0] = color[0];
2876 localcolor[1] = color[1];
2877 localcolor[2] = color[2];
2878 localcolor[3] = color[3];
2879 ilerp = 1.0f - localcolor[3];
2880 lerp = localcolor[3];
2881 for (x = startx;x < endx;x++)
2883 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2884 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2885 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2886 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Modulates a span of BGRA8 pixels by an interpolated vertex attribute:
//   out4ub = in4ub * attribute(arrayindex)
// The attribute is evaluated (and scaled by zf[]) at sub-span end points and stepped linearly in
// 16-bit fixed point (256 == 1.0) in between.
// NOTE(review): DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills data/slope with the
// attribute's start value and per-pixel gradient for this span - confirm against the macro.
2892 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2896 int startx = span->startx;
2897 int endx = span->endx;
2900 __m128i submod, substep, endsubmod;
2901 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// exchange lanes 0 and 2 (presumably RGBA -> BGRA, to match the byte order of the pixels)
2902 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2903 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, scaled by zf[startx], then converted to fixed point
2904 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2905 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2906 for (x = startx; x < endx;)
// a sub-span covers DPSOFTRAST_DRAW_MAXSUBSPAN pixels unless the span ends sooner
2908 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2909 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2910 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to the shortened length
2912 nextsub = endsub = endx-1;
2913 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute at the sub-span end point, scaled by zf[nextsub]
2917 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point change per pixel between the two end points
2918 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2919 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low four 16-bit lanes hold the factor for pixel x, high four lanes the factor for pixel x+1
2920 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2921 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration while submod advances by one per-pixel
// step - looks like the factor moves at half rate inside a sub-span; confirm whether intended.
2922 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// pixel bytes land in the high byte of each 16-bit lane (value << 8), so mulhi gives pixel * factor / 256
2924 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2925 pix = _mm_mulhi_epu16(pix, submod);
// packus clamps each lane to 0..255 while narrowing; two pixels are stored
2926 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same arithmetic for a single pixel (32-bit load and store)
2930 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2931 pix = _mm_mulhi_epu16(pix, submod);
2932 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute directly as BGRA8 pixels for a span.
// The attribute is evaluated (and scaled by zf[]) at sub-span end points and stepped linearly in
// 16-bit fixed point (4095 == 1.0) in between; each value is shifted right by 4 to reach 8 bits.
// NOTE(review): DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills data/slope with the
// attribute's start value and per-pixel gradient for this span - confirm against the macro.
2939 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2943 int startx = span->startx;
2944 int endx = span->endx;
2947 __m128i submod, substep, endsubmod;
2948 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// exchange lanes 0 and 2 (presumably RGBA -> BGRA, to match the output byte order)
2949 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2950 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at the first pixel, scaled by zf[startx], then converted to fixed point
2951 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2952 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2953 for (x = startx; x < endx;)
// a sub-span covers DPSOFTRAST_DRAW_MAXSUBSPAN pixels unless the span ends sooner
2955 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2956 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2957 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to the shortened length
2959 nextsub = endsub = endx-1;
2960 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute at the sub-span end point, scaled by zf[nextsub]
2964 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point change per pixel between the two end points
2965 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2966 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low four 16-bit lanes hold the value for pixel x, high four lanes the value for pixel x+1
2967 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2968 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration while submod advances by one per-pixel
// step - looks like the value moves at half rate inside a sub-span; confirm whether intended.
2969 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the 4 extra fraction bits, then packus clamps to 0..255; two pixels are stored
2971 __m128i pix = _mm_srai_epi16(submod, 4);
2972 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same conversion for a single pixel (32-bit store)
2976 __m128i pix = _mm_srai_epi16(submod, 4);
2977 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: out4ub = saturate(ina4ub + inb4ub - subcolor), two pixels per iteration.
// NOTE(review): DPSOFTRAST_Draw_Span_AddBloom (the float version) replaces a negative
// (inb - subcolor) with zero before adding; here the difference is added unclamped, so inb
// channels below the threshold darken ina. Confirm whether the missing clamp is intended.
2984 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2987 int x, startx = span->startx, endx = span->endx;
// threshold scaled by 255 and converted to integers, with lanes 0 and 2 exchanged (presumably RGBA -> BGRA)
2988 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrowed to 16-bit lanes and repeated, so both pixels of a pair use the same threshold
2989 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2990 for (x = startx;x+2 <= endx;x+=2)
// widen every byte to a 16-bit lane so the add/subtract below cannot overflow a byte
2992 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2993 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2994 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
// packus clamps each lane to 0..255 while narrowing back to bytes
2995 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel (32-bit loads and store)
2999 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3000 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3001 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3002 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Channel-wise product of two BGRA8 spans, two pixels per iteration:
//   out4ub = (ina4ub * inb4ub) / 256, rounded down (so 255 * 255 yields 254).
3007 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3010 int x, startx = span->startx, endx = span->endx;
3011 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended to 16 bits; inb bytes go into the HIGH byte of each lane (value << 8)
3013 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3014 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b << 8) == (a * b) >> 8
3015 pix1 = _mm_mulhi_epu16(pix1, pix2);
3016 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel (32-bit loads and store)
3020 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3021 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3022 pix1 = _mm_mulhi_epu16(pix1, pix2);
3023 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Saturating channel-wise sum of two BGRA8 spans, two pixels per iteration:
//   out4ub = min(ina4ub + inb4ub, 255).
3028 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3031 int x, startx = span->startx, endx = span->endx;
3032 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16-bit lanes so the sum (at most 510) fits
3034 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3035 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3036 pix1 = _mm_add_epi16(pix1, pix2);
// packus clamps each lane to 0..255 while narrowing back to bytes
3037 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel (32-bit loads and store)
3041 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3042 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3043 pix1 = _mm_add_epi16(pix1, pix2);
3044 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of inb4ub onto ina4ub, two pixels per iteration:
//   out4ub = saturate(ina4ub + inb4ub * tint), with the tint held in 16-bit fixed point (256 == 1.0).
// Unlike the other colour parameters in this file the tint lanes are not reordered;
// presumably inbtintbgra is already in BGRA order, as its name suggests - confirm with callers.
3049 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3052 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256, narrowed to 16-bit lanes and repeated for both pixels of a pair
3053 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3054 tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3055 for (x = startx;x+2 <= endx;x+=2)
// ina bytes are zero-extended; inb bytes go into the high byte of each lane (value << 8)
3057 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3058 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (b << 8) == (tint * b) >> 8, i.e. b scaled by the tint
3059 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3060 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel (32-bit loads and store)
3064 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3065 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3066 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3067 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Alpha blend of two BGRA8 spans weighted by inb4ub's alpha byte, two pixels per iteration:
//   out4ub = ina4ub + (inb4ub - ina4ub) * inb.a / 256
3072 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3075 int x, startx = span->startx, endx = span->endx;
3076 for (x = startx;x+2 <= endx;x+=2)
3078 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3079 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// copy each pixel's lane 3 (alpha) of inb into all four lanes of that pixel
3080 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256, and diff may be negative
3081 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3082 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same arithmetic for a single pixel; only the low four lanes carry data, so one shuffle suffices
3086 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3087 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3088 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3089 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3090 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends a BGRA8 span toward one constant colour weighted by that colour's alpha, two pixels per iteration:
//   out4ub = in4ub + (color - in4ub) * color.a / 256, with color scaled to 0..255 first.
3095 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3098 int x, startx = span->startx, endx = span->endx;
// colour scaled by 255 and converted to integers, with lanes 0 and 2 exchanged (presumably RGBA -> BGRA)
3099 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrowed to 16-bit lanes and repeated for both pixels of a pair
3100 localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
// alpha lane copied to every lane and pre-shifted by 4 for the fixed-point multiply below
3101 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3102 for (x = startx;x+2 <= endx;x+=2)
3104 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256
3105 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3106 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same arithmetic for a single pixel (32-bit load and store)
3110 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3111 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3119 void DPSOFTRAST_VertexShader_Generic(void)
3121 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3122 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3123 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3124 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3125 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3128 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3130 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3131 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3133 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3134 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3135 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3137 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3138 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3139 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3141 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3142 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3145 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3147 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3150 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3152 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3155 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3160 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3161 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3166 void DPSOFTRAST_VertexShader_PostProcess(void)
3168 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3169 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3170 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3173 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3175 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3176 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3177 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3178 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3179 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3180 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3181 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3183 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3184 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3186 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3187 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3189 // TODO: implement saturation
3191 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3193 // TODO: implement gammaramps
3195 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3200 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3202 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3205 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3207 // this is never called (because colormask is off when this shader is used)
3208 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3209 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3211 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3212 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3217 void DPSOFTRAST_VertexShader_FlatColor(void)
3219 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3220 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader: colour = texture * Color_Ambient, with the alpha factor
// taken from the Alpha uniform. Factors are 16-bit fixed point (256 == 1.0 for the colour lanes).
3223 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3226 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into colour buffer 0, at the row/column given by span->y and span->x
3227 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3228 int x, startx = span->startx, endx = span->endx;
3229 __m128i Color_Ambientm;
3230 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3231 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3234 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// with alpha test or a non-opaque blend mode, render into the temporary buffer instead;
// it is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end
3235 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3236 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256 and converted to integers, lanes 0 and 2 exchanged (presumably RGBA -> BGRA)
3237 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// lane 3 is cleared and replaced by the Alpha uniform scaled by 255 (truncated by the int cast)
3238 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3239 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrowed to 16-bit lanes, same four factors in both halves
3240 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3241 for (x = startx;x < endx;x++)
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3244 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3247 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texel bytes sit in the high byte of each lane (value << 8), so mulhi gives texel * factor / 256
3248 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3249 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3250 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one 32-bit texel
3256 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3257 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3258 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3260 if (pixel == buffer_FragColorbgra8)
3261 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3267 void DPSOFTRAST_VertexShader_VertexColor(void)
3269 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3271 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader:
//   colour = texture * (vertexcolour * Color_Diffuse + Color_Ambient)
// The vertex colour is interpolated per pixel and scaled by buffer_z[]; all factors are 16-bit fixed point.
// NOTE(review): DPSOFTRAST_CALCATTRIB is defined elsewhere; presumably it fills data/slope with the
// attribute's start value and per-pixel gradient for this span - confirm against the macro.
3274 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3277 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into colour buffer 0, at the row/column given by span->y and span->x
3278 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3279 int x, startx = span->startx, endx = span->endx;
3280 __m128i Color_Ambientm, Color_Diffusem;
3282 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3286 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3287 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// with alpha test or a non-opaque blend mode, render into the temporary buffer instead;
// it is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end
3288 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3289 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256, lanes 0 and 2 exchanged (presumably RGBA -> BGRA);
// lane 3 is replaced by the Alpha uniform scaled by 255 (truncated by the int cast)
3290 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3291 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3292 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3293 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 4096 with lane 3 zeroed, so the diffuse term adds nothing to the alpha lane
3294 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3295 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3296 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3297 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// same lane exchange as for the uniforms, then move to the first pixel and scale value and gradient by 4096
3298 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3299 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3300 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3301 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3302 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is advanced by one gradient step per loop iteration
3303 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3305 __m128i color, mod, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3306 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3309 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3310 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// one vertex-colour value per pixel, each scaled by that pixel's z; data is stepped three times
// here, between the four evaluations
3311 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3312 data = _mm_add_ps(data, slope);
3313 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3314 data = _mm_add_ps(data, slope);
3315 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3316 data = _mm_add_ps(data, slope);
3317 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolour) >> 16 brings two 4096-scaled factors back to 256 scale; ambient is added,
// then the sum multiplies the texel held in the high byte of each lane
3318 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3319 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3320 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3321 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3322 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one 32-bit texel
3328 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3329 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3330 mod = _mm_packs_epi32(mod, mod);
3331 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3332 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3334 if (pixel == buffer_FragColorbgra8)
3335 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3341 void DPSOFTRAST_VertexShader_Lightmap(void)
3343 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3345 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
//   colour = texture * (lightmap * Color_Diffuse + Color_Ambient)
// plus glowtexture * Color_Glow when SHADERPERMUTATION_GLOW is set.
// All factors are 16-bit fixed point with 256 == 1.0 for the colour lanes.
3348 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3351 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into colour buffer 0, at the row/column given by span->y and span->x
3352 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3353 int x, startx = span->startx, endx = span->endx;
3354 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3355 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3356 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3357 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3360 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// the colour texture is sampled with TEXCOORD0, the lightmap with TEXCOORD4
3361 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3362 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// with alpha test or a non-opaque blend mode, render into the temporary buffer instead;
// it is handed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end
3363 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3364 pixel = buffer_FragColorbgra8;
// ambient colour scaled by 256, lanes 0 and 2 exchanged (presumably RGBA -> BGRA);
// lane 3 is replaced by the Alpha uniform scaled by 255 (truncated by the int cast)
3365 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3366 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3367 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3368 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour scaled by 256 with lane 3 zeroed, so the lightmap adds nothing to the alpha lane
3369 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3370 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3371 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3372 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture with TEXCOORD0 and prepare its colour factor (lane 3 zeroed)
3374 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3375 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient factors, high half = glow factors; used by the single-pixel path below
3378 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3379 for (x = startx;x < endx;x++)
3381 __m128i color, lightmap, glow, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3382 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3385 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3386 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3387 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// texel bytes sit in the high byte of each lane (value << 8), so every mulhi is texel * factor / 256;
// pix covers the first two pixels, pix2 the last two
3388 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3389 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3390 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3391 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3392 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3393 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3394 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit colour is computed in the low half and the glow term in the high
// half of one register (the lightmap's high half is zero, so only the glow factor remains there);
// the two halves are then added together
3400 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3401 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3402 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3403 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3404 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3405 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow: same lighting arithmetic, no glow term
3410 for (x = startx;x < endx;x++)
3412 __m128i color, lightmap, pix;
// four-pixel path: taken when at least four pixels remain and their four mask bytes are all 1
3413 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3416 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3417 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3418 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3419 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3420 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3421 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3422 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same arithmetic on one 32-bit texel of each texture
3428 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3429 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3430 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3431 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case needs the finishing pass
3434 if (pixel == buffer_FragColorbgra8)
3435 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3441 void DPSOFTRAST_VertexShader_FakeLight(void)
3443 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3446 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3449 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3450 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3451 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3452 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3453 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for model-space light direction maps.
// Delegates entirely to the lightmap vertex stage.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3463 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3465 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for tangent-space light direction maps.
// Delegates entirely to the lightmap vertex stage.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3476 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3478 DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
// Vertex stage for the LightDirection shader mode.
// For every vertex it re-expresses two vectors in the per-vertex
// (svector, tvector, normal) basis and stores them for the pixel stage:
//   TEXCOORD1 <- the LightDir uniform projected onto that basis
//   TEXCOORD2 <- (EyePosition - vertex position) projected onto that basis
// TEXCOORD0 is transformed by the TexMatrixM1 uniform, and POSITION is
// transformed/projected by the model-view-projection matrix at the end.
3484 void DPSOFTRAST_VertexShader_LightDirection(void)
3487 int numvertices = dpsoftrast.numvertices;
3489 float LightVector[4];
3490 float EyePosition[4];
3491 float EyeVectorModelSpace[4];
// fetch the two uniforms used by the loop (only components 0..2 are read there)
3497 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3498 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3499 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3500 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3501 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3502 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3503 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3504 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// populate the post-transform arrays that the loop below reads and rewrites
3505 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3506 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3507 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3508 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3509 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3510 for (i = 0;i < numvertices;i++)
// copy this vertex's inputs to locals first: TEXCOORD1 and TEXCOORD2 are
// overwritten below, but svector/tvector are still needed for the eye vector
3512 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3513 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3514 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3515 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3516 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3517 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3518 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3519 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3520 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3521 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3522 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3523 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector: dot LightDir with svector, tvector and normal in turn
3524 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3525 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3526 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
// TEXCOORD1 now carries the light vector (4th component forced to 0)
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3528 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3529 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3530 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// eye vector: from the vertex towards EyePosition, then into the same basis
3531 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3532 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3533 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3534 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3535 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3536 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (4th component forced to 0)
3537 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3538 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3539 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3540 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// the second argument is -1 rather than an array id, unlike the FakeLight
// shader's call - presumably "use the already loaded POSITION data"; TODO confirm
3542 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helpers used by the pixel shaders.
// Min/Max are plain conditional-expression macros: each argument can be
// evaluated twice, so do not pass expressions with side effects.
3545 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3546 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of two indexable operands
3547 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length = dot(v, v); no square root is taken
3548 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// length via sqrt() of the squared length
3549 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the length of v with sqrt();
// presumably it then rescales v in place to unit length - TODO confirm
// against the rest of the macro body.
3550 #define DPSOFTRAST_Vector3Normalize(v)\
3553 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel stage for the LightDirection shader mode.
// Shades one span with a directional light using per-pixel float math.
// thread supplies the uniforms and shader_permutation flags; triangle and
// span are forwarded to the span/texture helpers and attribute setup.
// Three per-pixel loops exist, selected by thread->shader_permutation:
//   SPECULAR : ambient + diffuse + specular (normal map and gloss map)
//   DIFFUSE  : ambient + diffuse (normal map)
//   fallback : ambient only
// Each loop optionally adds the glow texture when GLOW is set. The result is
// written to buffer_FragColorbgra8 and handed to the BGRA8 finish routine.
// NOTE(review): only the SPECULAR loop adds the pants/shirt colormapping to
// the diffuse texel; the DIFFUSE and fallback loops ignore it even though the
// pants/shirt textures are sampled when COLORMAPPING is set - confirm intent.
3564 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, 4 bytes per pixel for the texel/colour buffers
3566 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3567 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3568 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3569 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3570 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574 int x, startx = span->startx, endx = span->endx;
3575 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// data/slope pairs: the interpolated attribute at pixel x is data + slope*x
3576 float LightVectordata[4];
3577 float LightVectorslope[4];
3578 float EyeVectordata[4];
3579 float EyeVectorslope[4];
3581 float diffusetex[4];
3583 float surfacenormal[4];
3584 float lightnormal[4];
3586 float specularnormal[4];
3589 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped so that local
// index 0/1/2 lines up with byte 0/1/2 of the bgra8 texel buffers.
3591 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3592 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3593 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3594 Color_Glow[3] = 0.0f;
3595 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3596 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3597 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the 4th ambient slot holds the Alpha uniform: it scales the output alpha
3598 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3599 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3600 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3601 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3602 Color_Pants[3] = 0.0f;
3603 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3604 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3605 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3606 Color_Shirt[3] = 0.0f;
// start the span and sample the textures every path needs; pants/shirt and
// glow are only sampled when their permutation bit is set
3607 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3608 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3611 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3612 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3614 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3616 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- SPECULAR path: ambient + diffuse + specular ----
3618 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3620 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3621 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3622 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3623 Color_Diffuse[3] = 0.0f;
3624 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3625 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3626 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3627 LightColor[3] = 0.0f;
// TEXCOORD1 carries the light vector written by the vertex stage
3628 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3629 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3631 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3632 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3633 Color_Specular[3] = 0.0f;
// pre-divide by 255 because the exponent is later multiplied by the raw
// 0..255 gloss alpha byte
3634 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD2 carries the eye vector written by the vertex stage
3635 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3636 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3637 for (x = startx;x < endx;x++)
// diffuse texel as floats in the 0..255 range
3640 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3641 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3642 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3643 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3644 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add tinted pants/shirt layers; Color_Pants[3] and Color_Shirt[3] are 0,
// so the alpha line below leaves diffusetex[3] unchanged
3646 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3647 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3648 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3649 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3651 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3652 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3653 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3654 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to roughly -1..+1;
// bytes are read in 2,1,0 order to undo the BGR storage order
3655 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3656 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3657 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3658 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated light vector at this pixel, scaled by z, then normalized
3660 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3661 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3662 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3663 DPSOFTRAST_Vector3Normalize(lightnormal);
// interpolated eye vector at this pixel, same scheme
3665 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3666 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3667 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3668 DPSOFTRAST_Vector3Normalize(eyenormal);
// half vector between the light and eye directions
3670 specularnormal[0] = lightnormal[0] + eyenormal[0];
3671 specularnormal[1] = lightnormal[1] + eyenormal[1];
3672 specularnormal[2] = lightnormal[2] + eyenormal[2];
3673 DPSOFTRAST_Vector3Normalize(specularnormal);
// both lighting terms are clamped at zero so back-facing light adds nothing
3675 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3676 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform scaled by the gloss texel's alpha
3677 specular = pow(specular, SpecularPower * glosstex[3]);
3678 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// variant with the glow term; every channel is clamped to at most 255
3680 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3681 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3682 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3683 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3687 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3688 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3689 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3690 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// store the clamped pixel
3692 buffer_FragColorbgra8[x*4+0] = d[0];
3693 buffer_FragColorbgra8[x*4+1] = d[1];
3694 buffer_FragColorbgra8[x*4+2] = d[2];
3695 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE path: ambient + diffuse, no specular, no colormapping ----
3698 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3700 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3701 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3702 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3703 Color_Diffuse[3] = 0.0f;
3704 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3705 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3706 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3707 LightColor[3] = 0.0f;
3708 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3709 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3710 for (x = startx;x < endx;x++)
3713 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3714 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3715 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3716 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// normal map decode and light vector interpolation, as in the SPECULAR path
3717 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3718 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3719 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3720 DPSOFTRAST_Vector3Normalize(surfacenormal);
3722 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3723 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3724 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3725 DPSOFTRAST_Vector3Normalize(lightnormal);
3727 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3728 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// variant with the glow term
3730 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3731 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3732 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3733 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3737 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3738 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3739 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3740 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3742 buffer_FragColorbgra8[x*4+0] = d[0];
3743 buffer_FragColorbgra8[x*4+1] = d[1];
3744 buffer_FragColorbgra8[x*4+2] = d[2];
3745 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- fallback path: ambient only (presumably reached when neither
// SPECULAR nor DIFFUSE is set - TODO confirm) ----
3750 for (x = startx;x < endx;x++)
3753 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3754 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3755 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3756 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3758 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// variant with the glow term
3760 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3761 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3762 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3763 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3767 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3768 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3769 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3770 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3772 buffer_FragColorbgra8[x*4+0] = d[0];
3773 buffer_FragColorbgra8[x*4+1] = d[1];
3774 buffer_FragColorbgra8[x*4+2] = d[2];
3775 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the BGRA8 output stage
3778 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the LightSource shader mode.
// For every vertex it builds two vectors in the per-vertex
// (svector, tvector, normal) basis and stores them for the pixel stage:
//   TEXCOORD1 <- (LightPosition - vertex position) projected onto that basis
//   TEXCOORD2 <- (EyePosition - vertex position) projected onto that basis
// After the loop POSITION is transformed/projected by the
// model-view-projection matrix and TEXCOORD3 is regenerated from POSITION
// with the ModelToLightM1 matrix.
3783 void DPSOFTRAST_VertexShader_LightSource(void)
3786 int numvertices = dpsoftrast.numvertices;
3787 float LightPosition[4];
3788 float LightVector[4];
3789 float LightVectorModelSpace[4];
3790 float EyePosition[4];
3791 float EyeVectorModelSpace[4];
// fetch the two uniforms used by the loop (only components 0..2 are read there)
3797 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3798 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3799 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3800 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3801 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3802 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3803 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3804 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// populate the post-transform arrays; TEXCOORD4 is loaded here but is not
// referenced by the loop below
3805 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3806 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3807 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3808 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3809 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3810 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3811 for (i = 0;i < numvertices;i++)
// copy this vertex's inputs to locals first: TEXCOORD1 and TEXCOORD2 are
// overwritten below, but svector/tvector are still needed for the eye vector
3813 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3814 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3815 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3816 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3817 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3818 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3819 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3820 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3821 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3822 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3823 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3824 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector: from the vertex towards LightPosition (not normalized here)
3825 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3826 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3827 LightVectorModelSpace[2] = LightPosition[2] - position[2];
// dot it with svector, tvector and normal in turn
3828 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3829 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3830 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (4th component forced to 0)
3831 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3832 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3833 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3834 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// eye vector: from the vertex towards EyePosition, then into the same basis
3835 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3836 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3837 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3838 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3839 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3840 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (4th component forced to 0)
3841 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3842 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3843 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3844 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// the second argument is -1 rather than an array id - presumably "use the
// already loaded POSITION data"; TODO confirm
3846 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (read as the normal inside the loop) is replaced by POSITION
// transformed with ModelToLightM1.
// NOTE(review): presumably this reads the untransformed input positions, not
// the projected ones produced by the call above - confirm in Array_Transform.
3847 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3850 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3853 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3854 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3855 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3856 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3857 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3858 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3859 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3860 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3861 int x, startx = span->startx, endx = span->endx;
3862 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3863 float CubeVectordata[4];
3864 float CubeVectorslope[4];
3865 float LightVectordata[4];
3866 float LightVectorslope[4];
3867 float EyeVectordata[4];
3868 float EyeVectorslope[4];
3870 float diffusetex[4];
3872 float surfacenormal[4];
3873 float lightnormal[4];
3875 float specularnormal[4];
3878 float SpecularPower;
3879 float CubeVector[4];
3882 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3883 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3884 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3885 Color_Glow[3] = 0.0f;
3886 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3887 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3888 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3889 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3890 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3891 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3892 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3893 Color_Diffuse[3] = 0.0f;
3894 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3895 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3896 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3897 Color_Specular[3] = 0.0f;
3898 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3899 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3900 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3901 Color_Pants[3] = 0.0f;
3902 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3903 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3904 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3905 Color_Shirt[3] = 0.0f;
3906 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3907 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3908 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3909 LightColor[3] = 0.0f;
3910 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3911 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3912 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3913 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3914 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3915 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3916 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3917 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3919 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3920 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3922 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3923 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3924 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3926 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3927 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3928 for (x = startx;x < endx;x++)
3931 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3932 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3933 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3934 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3935 if (attenuation < 0.01f)
3937 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3939 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3940 if (attenuation < 0.01f)
3944 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3945 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3946 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3947 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3948 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3950 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3951 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3952 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3953 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3955 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3956 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3957 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3958 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3959 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3962 DPSOFTRAST_Vector3Normalize(surfacenormal);
3964 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3965 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3966 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3967 DPSOFTRAST_Vector3Normalize(lightnormal);
3969 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3970 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3971 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3972 DPSOFTRAST_Vector3Normalize(eyenormal);
3974 specularnormal[0] = lightnormal[0] + eyenormal[0];
3975 specularnormal[1] = lightnormal[1] + eyenormal[1];
3976 specularnormal[2] = lightnormal[2] + eyenormal[2];
3977 DPSOFTRAST_Vector3Normalize(specularnormal);
3979 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3980 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3981 specular = pow(specular, SpecularPower * glosstex[3]);
3982 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3984 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3985 attenuation *= (1.0f / 255.0f);
3986 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3987 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3988 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3989 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3993 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
3994 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
3995 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
3996 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
3998 buffer_FragColorbgra8[x*4+0] = d[0];
3999 buffer_FragColorbgra8[x*4+1] = d[1];
4000 buffer_FragColorbgra8[x*4+2] = d[2];
4001 buffer_FragColorbgra8[x*4+3] = d[3];
4004 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4006 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4007 for (x = startx;x < endx;x++)
4010 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4011 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4012 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4013 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4014 if (attenuation < 0.01f)
4016 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4018 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4019 if (attenuation < 0.01f)
4023 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4024 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4025 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4026 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4027 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4029 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4030 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4031 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4032 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4034 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4035 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4036 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4037 DPSOFTRAST_Vector3Normalize(surfacenormal);
4039 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4040 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4041 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4042 DPSOFTRAST_Vector3Normalize(lightnormal);
4044 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4045 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4047 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4048 attenuation *= (1.0f / 255.0f);
4049 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4050 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4051 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4052 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4056 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4057 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4058 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4059 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4061 buffer_FragColorbgra8[x*4+0] = d[0];
4062 buffer_FragColorbgra8[x*4+1] = d[1];
4063 buffer_FragColorbgra8[x*4+2] = d[2];
4064 buffer_FragColorbgra8[x*4+3] = d[3];
4069 for (x = startx;x < endx;x++)
4072 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4073 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4074 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4075 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4076 if (attenuation < 0.01f)
4078 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4080 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4081 if (attenuation < 0.01f)
4085 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4086 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4087 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4088 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4089 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4091 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4092 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4093 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4094 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4096 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4098 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4099 attenuation *= (1.0f / 255.0f);
4100 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4101 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4102 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4103 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4107 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4108 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4109 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4110 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4112 buffer_FragColorbgra8[x*4+0] = d[0];
4113 buffer_FragColorbgra8[x*4+1] = d[1];
4114 buffer_FragColorbgra8[x*4+2] = d[2];
4115 buffer_FragColorbgra8[x*4+3] = d[3];
4118 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex-stage callback that DPSOFTRAST_ShaderModeTable pairs with DPSOFTRAST_PixelShader_Refraction.
// Its only visible work is a single DPSOFTRAST_Array_TransformProject call that names the position
// array for both array arguments and passes the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f.
// NOTE(review): presumably this transforms the position array by that matrix - confirm against the helper.
4124 void DPSOFTRAST_VertexShader_Refraction(void)
4126 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span callback for the refraction entry of the shader table.
// No refraction is computed in the visible code: the span's pixels are emitted as all-zero BGRA bytes.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4129 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is handed to DPSOFTRAST_Draw_Span_Begin and not read again in this function
4132 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4133 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4134 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes for pixels startx..endx-1 are zeroed (4 bytes per pixel); the rest of the buffer stays uninitialized
4135 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4136 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex-stage callback that DPSOFTRAST_ShaderModeTable pairs with DPSOFTRAST_PixelShader_Water.
// It makes one DPSOFTRAST_Array_TransformProject call with the position array named for both array
// arguments and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
// NOTE(review): presumably a position transform by that matrix - confirm against the helper.
4141 void DPSOFTRAST_VertexShader_Water(void)
4143 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span callback for the water entry of the shader table.
// The visible code computes no water effect: it outputs zeroed BGRA bytes for the span's pixels.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4147 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is passed to DPSOFTRAST_Draw_Span_Begin and is not used afterwards here
4150 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4151 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear exactly the pixels startx..endx-1, 4 bytes each
4153 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4154 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex-stage callback that DPSOFTRAST_ShaderModeTable pairs with DPSOFTRAST_PixelShader_ShowDepth.
// It makes one DPSOFTRAST_Array_TransformProject call with the position array named for both array
// arguments and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
// NOTE(review): presumably a position transform by that matrix - confirm against the helper.
4159 void DPSOFTRAST_VertexShader_ShowDepth(void)
4161 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span callback for the show-depth entry of the shader table.
// Despite the name, the visible code does not turn depth into color: buffer_z is only passed to
// DPSOFTRAST_Draw_Span_Begin and the output pixels are all-zero BGRA bytes.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4164 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4167 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4168 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4169 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear exactly the pixels startx..endx-1, 4 bytes each
4170 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4171 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex-stage callback that DPSOFTRAST_ShaderModeTable pairs with DPSOFTRAST_PixelShader_DeferredGeometry.
// It makes one DPSOFTRAST_Array_TransformProject call with the position array named for both array
// arguments and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
// NOTE(review): presumably a position transform by that matrix - confirm against the helper.
4176 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4178 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span callback for the deferred-geometry entry of the shader table.
// The visible code produces no geometry-buffer data: the span's pixels are emitted as zeroed BGRA bytes.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4181 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is passed to DPSOFTRAST_Draw_Span_Begin and is not used afterwards here
4184 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4185 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4186 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear exactly the pixels startx..endx-1, 4 bytes each
4187 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4188 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex-stage callback that DPSOFTRAST_ShaderModeTable pairs with DPSOFTRAST_PixelShader_DeferredLightSource.
// It makes one DPSOFTRAST_Array_TransformProject call with the position array named for both array
// arguments and the ModelViewProjectionMatrixM1 slot of dpsoftrast.uniform4f as the third argument.
// NOTE(review): presumably a position transform by that matrix - confirm against the helper.
4193 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4195 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span callback for the deferred-light-source entry of the shader table.
// The visible code performs no lighting: the span's pixels are emitted as zeroed BGRA bytes.
//   thread, triangle, span - forwarded unchanged to the span begin/finish helpers.
4198 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// buffer_z is passed to DPSOFTRAST_Draw_Span_Begin and is not used afterwards here
4201 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4202 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4203 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// clear exactly the pixels startx..endx-1, 4 bytes each
4204 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4205 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its two callbacks and the vertex arrays / texture units it uses.
// NOTE(review): the rows of DPSOFTRAST_ShaderModeTable start with an integer before the Vertex pointer and
// DPSOFTRAST_Interpret_Draw reads a lodarrayindex member, so presumably an int lodarrayindex member
// precedes Vertex - confirm against the full declaration.
4210 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback, takes no arguments
4213 void (*Vertex)(void);
// per-span pixel callback, invoked from DPSOFTRAST_Draw_ProcessSpans
4214 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices used by the mode; readers test each entry against DPSOFTRAST_ARRAY_TOTAL (the table uses ~0 as the out-of-range marker)
4215 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit indices used by the mode; readers test each entry against DPSOFTRAST_MAXTEXTUREUNITS (same ~0 marker)
4216 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4218 DPSOFTRAST_ShaderModeInfo;
// Per-shader-mode dispatch table, indexed by the shader mode value (thread->shader_mode / dpsoftrast.shader_mode).
// Each row lists, in order: an integer (2 in every row; NOTE(review): presumably the lodarrayindex member, whose
// value 2 would have to equal DPSOFTRAST_ARRAY_TEXCOORD0 for the mip setup to trigger - confirm), the vertex
// callback, the span callback, the list of vertex arrays and the list of texture units.
// ~0 stored into an unsigned char becomes 255 and marks the end of a list.
// NOTE(review): the last five rows give no texunits initializer, so that member is zero-filled instead of
// starting with the ~0 marker; a reader scanning texunits for those modes sees texture unit 0 in every
// slot - confirm this is intended.
4220 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4222 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4223 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4224 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4225 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4226 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4227 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4228 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {~0}, {~0}},
4229 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4230 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4231 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4232 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
// the remaining modes use the placeholder span callbacks that output zeroed pixels
4233 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {~0}},
4234 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4235 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4236 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4237 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}}
// Runs the spans queued on `thread` (thread->spans[0 .. numspans-1]) through the depth test and the
// current shader mode's Span callback, then resets thread->numspans to 0.
// Each processed span has its pixelmask pointer aimed at the local pixelmask array, so that pointer is
// only meaningful while this call is running.
4240 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4247 // unsigned int *colorpixel;
4248 unsigned int *depthpixel;
4254 DPSOFTRAST_State_Triangle *triangle;
4255 DPSOFTRAST_State_Span *span;
4256 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4257 for (i = 0; i < thread->numspans; i++)
4259 span = &thread->spans[i];
4260 triangle = &thread->triangles[span->triangle];
// depth-tested path: taken only when depth testing is enabled and a depth buffer exists
4261 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the span position
4263 wslope = triangle->w[0];
4264 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth and per-pixel step; the polygon offset terms are subtracted from the starting depth
4265 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4266 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4267 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4268 startx = span->startx;
// fill pixelmask[x] by comparing the stored depth with the interpolated depth d, one loop per depth function
4270 switch(thread->fb_depthfunc)
4273 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4274 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4275 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4276 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4277 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4278 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4279 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4281 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4282 //for (x = startx;x < endx;x++)
4283 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4284 // if there is no color buffer, skip pixel shader
// NOTE(review): presumably these two loops trim masked-out pixels from the left and right ends of the span - confirm
4285 while (startx < endx && !pixelmask[startx])
4287 while (endx > startx && !pixelmask[endx-1])
4290 continue; // no pixels to fill
4291 span->pixelmask = pixelmask;
4292 span->startx = startx;
4294 // run pixel shader if appropriate
4295 // do this before running depthmask code, to allow the pixelshader
4296 // to clear pixelmask values for alpha testing
4297 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4298 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// NOTE(review): presumably the depth-write pass, storing d for pixels still set in pixelmask - confirm
4299 if (thread->depthmask)
4300 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4306 // no depth testing means we're just dealing with color...
4307 // if there is no color buffer, skip pixel shader
4308 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes: mark startx..endx-1 as visible before running the shader
4310 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4311 span->pixelmask = pixelmask;
4312 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue is now empty
4316 thread->numspans = 0;
// Declares the Draw command through the DEFCOMMAND macro, passing 22, the name Draw and the field list below.
// NOTE(review): DEFCOMMAND is defined elsewhere; presumably it produces the DPSOFTRAST_Command_Draw type and
// DPSOFTRAST_OPCODE_Draw used by the draw functions, and supplies the commandsize member they read - confirm.
// refcount is an ATOMIC_COUNTER because DPSOFTRAST_Interpret_Draw decrements it with ATOMIC_DECREMENT.
4319 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of `thread`.
// Visible steps: validate draw state, clamp the thread's two y ranges (miny1..maxy1 and miny2..maxy2) to
// the scissor rectangle and test the command's starty/endy against both; then, per triangle: fetch the
// three vertex indices, classify the vertices against the near plane (z + w >= 0), build up to four
// screen-space points, compute the clamped integer bounding box, derive the w plane, the attribute planes
// and a mip level per texture unit, and emit spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels into
// thread->spans.
// Queued work is flushed through DPSOFTRAST_Draw_ProcessSpans when the span or triangle queue fills and
// once more at the end.
// command->refcount is atomically decremented at the early-out test and again after the triangle loop; when
// it reaches zero and the command is no larger than a bare DPSOFTRAST_Command_Draw, command->arrays is
// released with MM_FREE.
4321 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4324 int cullface = thread->cullface;
4325 int minx, maxx, miny, maxy;
4326 int miny1, maxy1, miny2, maxy2;
4327 __m128i fbmin, fbmax;
4328 __m128 viewportcenter, viewportscale;
4329 int firstvertex = command->firstvertex;
4330 int numvertices = command->numvertices;
4331 int numtriangles = command->numtriangles;
4332 const int *element3i = command->element3i;
4333 const unsigned short *element3s = command->element3s;
4334 int clipped = command->clipped;
4341 int starty, endy, bandy;
4345 __m128 triangleedge1, triangleedge2, trianglenormal;
4348 DPSOFTRAST_State_Triangle *triangle;
4349 DPSOFTRAST_Texture *texture;
4350 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two y ranges to the scissor rectangle
4351 miny = thread->fb_scissor[1];
4352 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4353 miny1 = bound(miny, thread->miny1, maxy);
4354 maxy1 = bound(miny, thread->maxy1, maxy);
4355 miny2 = bound(miny, thread->miny2, maxy);
4356 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range overlaps neither of this thread's ranges: drop this thread's reference to the command
4357 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4359 if (!ATOMIC_DECREMENT(command->refcount))
4361 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4362 MM_FREE(command->arrays);
4366 minx = thread->fb_scissor[0];
4367 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// fbmin/fbmax: clamp bounds replicated as 16-bit x,y pairs; the maximum is made inclusive by subtracting 1,
// and the y bounds run from the start of the first range to the end of the second
4368 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4369 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4370 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4371 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4372 screen[3] = _mm_setzero_ps();
4373 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4374 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices float4 screen coordinates; the per-vertex arrays follow them
4376 const float *screencoord4f = command->arrays;
4377 const float *arrays = screencoord4f + numvertices*4;
4379 // generate the 3 edges of this triangle
4380 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices are rebased by firstvertex (16-bit index source first, 32-bit source second)
4383 e[0] = element3s[i*3+0] - firstvertex;
4384 e[1] = element3s[i*3+1] - firstvertex;
4385 e[2] = element3s[i*3+2] - firstvertex;
4389 e[0] = element3i[i*3+0] - firstvertex;
4390 e[1] = element3i[i*3+1] - firstvertex;
4391 e[2] = element3i[i*3+2] - firstvertex;
// helper macros used by the per-triangle code below:
//   SKIPBACKFACE      - builds the two edges leaving screen[1], puts their cross-product z term in
//                       trianglenormal and tests its sign
//   CLIPPEDVERTEXLERP - clipfrac[p1] = clipdist[p1] / (clipdist[p1] - clipdist[p2]); interpolates vertex p1
//                       toward p2 by it and projects the result into screen[k]
//   CLIPPEDVERTEXCOPY - loads the already projected screen coordinate of vertex p1 into screen[k]
//   GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS - the same copy / interpolate choice applied to an attribute array
4400 #define SKIPBACKFACE \
4401 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4402 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4403 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4404 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4405 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4409 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4413 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4418 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4419 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4421 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4422 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4424 #define CLIPPEDVERTEXCOPY(k,p1) \
4425 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4427 #define GENATTRIBCOPY(attrib, p1) \
4428 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4429 #define GENATTRIBLERP(attrib, p1, p2) \
4431 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4432 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4434 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4438 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4439 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4440 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4441 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4442 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4443 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4444 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4450 // calculate distance from nearplane
4451 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4452 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4453 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// one branch per combination of vertices in front of the near plane; each builds 3 or 4 screen points
4454 if (clipdist[0] >= 0.0f)
4456 if (clipdist[1] >= 0.0f)
4458 if (clipdist[2] >= 0.0f)
4461 // triangle is entirely in front of nearplane
4462 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4469 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4477 if (clipdist[2] >= 0.0f)
4479 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4486 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4493 else if (clipdist[1] >= 0.0f)
4495 if (clipdist[2] >= 0.0f)
4497 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4504 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4510 else if (clipdist[2] >= 0.0f)
4512 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4517 else continue; // triangle is entirely behind nearplane
4520 // calculate integer y coords for triangle points
4521 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4522 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4523 screenmin = _mm_min_epi16(screeni, screenir),
4524 screenmax = _mm_max_epi16(screeni, screenir);
4525 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4526 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the bounding box to fbmin/fbmax
4527 screenmin = _mm_max_epi16(screenmin, fbmin);
4528 screenmax = _mm_min_epi16(screenmax, fbmax);
4529 // skip offscreen triangles
4530 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// y extent of the clamped bounding box; endy is exclusive
4532 starty = _mm_extract_epi16(screenmin, 1);
4533 endy = _mm_extract_epi16(screenmax, 1)+1;
// extent lies entirely in the gap between this thread's two y ranges
4534 if (starty >= maxy1 && endy <= miny2)
4536 screeny = _mm_srai_epi32(screeni, 16);
4539 triangle = &thread->triangles[thread->numtriangles];
4541 // calculate attribute plans for triangle data...
4542 // okay, this triangle is going to produce spans, we'd better project
4543 // the interpolants now (this is what gives perspective texturing),
4544 // this consists of simply multiplying all arrays by the W coord
4545 // (which is basically 1/Z), which will be undone per-pixel
4546 // (multiplying by Z again) to get the perspective-correct array
4549 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4550 __m128 mipedgescale, mipdensity;
// both edge vectors divided by the scalar cross-product term from SKIPBACKFACE, then broadcast per component
4551 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4552 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4553 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4554 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4555 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w of each of the first three screen points, broadcast to all lanes
4556 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4557 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4558 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4559 attribedge1 = _mm_sub_ss(w0, w1);
4560 attribedge2 = _mm_sub_ss(w2, w1);
4561 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4562 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4563 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4564 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4565 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// plane for w: [0] = x slope, [1] = y slope, [2] = value at the screen origin
4566 _mm_store_ss(&triangle->w[0], attribxslope);
4567 _mm_store_ss(&triangle->w[1], attribyslope);
4568 _mm_store_ss(&triangle->w[2], attriborigin);
4569 mipedgescale = _mm_setzero_ps();
// same plane setup for each array listed by the shader mode, using the attributes multiplied by w
4570 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4572 __m128 attrib0, attrib1, attrib2;
4573 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4574 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4576 arrays += numvertices*4;
4577 GENATTRIBS(attrib0, attrib1, attrib2);
4578 attriborigin = _mm_mul_ps(attrib1, w1);
4579 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4580 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4581 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4582 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4583 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4584 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4585 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4586 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
// the array named by lodarrayindex also yields mipedgescale: its attribute deltas along the two edges,
// scaled by the reciprocal screen-space length of each edge
4587 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4589 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4590 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4591 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4592 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for every texture unit the shader mode lists (all levels start at 0)
4596 memset(triangle->mip, 0, sizeof(triangle->mip));
4597 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4599 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4600 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4602 texture = thread->texbound[texunit];
4603 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4605 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4606 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4607 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4608 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4609 // this will be multiplied in the texturing routine by the texture resolution
4610 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), clamped to the texture's last mip level
4613 y = (int)(log((float)y)*0.5f/M_LN2);
4614 if (y > texture->mipmaps - 1)
4615 y = texture->mipmaps - 1;
4616 triangle->mip[texunit] = y;
// walk the scanlines: bandy first limits the walk to the first y range, then the update clause moves y
// up to miny2 and bandy to the end of the second range
4622 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4625 __m128 xcoords, xslope;
// ycc lanes are all-ones for each point whose integer y lies above the current scanline (screeny < y)
4626 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4627 int yccmask = _mm_movemask_epi8(ycc);
4628 int edge0p, edge0n, edge1p, edge1n;
// four-point polygon: the mask pattern picks the two edges (previous/next point each) that bound this scanline
4635 case 0xFFFF: /*0000*/ y = endy; continue;
4636 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4637 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4638 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4639 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4640 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4641 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4642 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4643 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4644 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4645 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4646 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4647 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4648 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4649 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4650 case 0x0000: /*1111*/ y++; continue;
// three-point polygon: same selection with point indices 0..2
4658 case 0xFFFF: /*000*/ y = endy; continue;
4659 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4660 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4661 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4662 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4663 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4664 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4665 case 0x0000: /*111*/ y++; continue;
// NOTE(review): presumably nexty becomes the smallest point y not yet passed, i.e. the last scanline
// that can reuse these two edges - confirm; it is then limited to the current band
4668 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4669 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4670 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4671 nexty = _mm_extract_epi16(ycc, 0);
4672 if (nexty >= bandy) nexty = bandy-1;
// per-edge deltas divided by their y component give dx/dy; xcoords is each edge's x at scanline y, plus 0.5
4673 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4674 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4675 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4676 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4677 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two edges when the first one starts further right, so the low lane holds the smaller x
4678 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4680 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4681 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// step both edge x values down the scanlines up to nexty
4683 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4685 int startx, endx, offset;
4686 startx = _mm_cvtss_si32(xcoords);
4687 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4690 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
4691 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4693 if (endx > maxx) endx = maxx;
4694 if (startx >= endx) continue;
// split the scanline into chunks of DPSOFTRAST_DRAW_MAXSPANLENGTH pixels; span startx/endx are relative to offset
4695 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4697 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4698 span->triangle = thread->numtriangles;
4701 span->startx = max(minx - offset, 0);
4702 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4703 if (span->startx >= span->endx)
// span queue full: flush it
4705 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4706 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush the spans that refer to it and start over
4711 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4713 DPSOFTRAST_Draw_ProcessSpans(thread);
4714 thread->numtriangles = 0;
// drop this thread's reference; a separately allocated vertex array is released once the count reaches zero
4718 if (!ATOMIC_DECREMENT(command->refcount))
4720 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721 MM_FREE(command->arrays);
// flush whatever is still queued
4724 if (thread->numspans > 0 || thread->numtriangles > 0)
4726 DPSOFTRAST_Draw_ProcessSpans(thread);
4727 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its per-vertex data and index list, and points the global
// dpsoftrast state (screencoord4f, post_array4f[]) at that storage.
//   firstvertex, numvertices, numtriangles - copied into the command and (first two) into dpsoftrast.
//   element3i / element3s - 32-bit or 16-bit triangle indices; one list is copied behind the vertex arrays.
// Data layout: numvertices float4 screen coordinates, numvertices float4 positions, then one float4 array
// per entry the current shader mode lists in its arrays[], then the indices.
// If command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is allocated separately with
// MM_CALLOC; otherwise it is placed directly behind the command.
// NOTE(review): the MM_CALLOC result is not checked in the visible lines, and the sizes are computed in int -
// confirm neither can fail or overflow for the inputs callers pass.
4732 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4736 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 arrays are always present: screen coordinates and positions
4737 int datasize = 2*numvertices*sizeof(float[4]);
4738 DPSOFTRAST_Command_Draw *command;
4739 unsigned char *data;
// add one float4 array for each array the current shader mode lists
4740 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4742 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4743 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4745 datasize += numvertices*sizeof(float[4]);
// room for the index list in whichever width is used
4748 datasize += numtriangles*sizeof(unsigned short[3]);
4750 datasize += numtriangles*sizeof(int[3]);
4751 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: command only, data in its own zeroed allocation
4752 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4754 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4755 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives directly behind the command
4759 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4760 data = (unsigned char *)command + commandsize;
4762 command->firstvertex = firstvertex;
4763 command->numvertices = numvertices;
4764 command->numtriangles = numtriangles;
4765 command->arrays = (float *)data;
// clear all output array pointers, then hand out consecutive float4 arrays from data
4766 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4767 dpsoftrast.firstvertex = firstvertex;
4768 dpsoftrast.numvertices = numvertices;
4769 dpsoftrast.screencoord4f = (float *)data;
4770 data += numvertices*sizeof(float[4]);
4771 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4772 data += numvertices*sizeof(float[4]);
4773 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4775 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4776 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4778 dpsoftrast.post_array4f[j] = (float *)data;
4779 data += numvertices*sizeof(float[4]);
// the index list is copied to the space after the last vertex array; the unused pointer stays NULL
4781 command->element3i = NULL;
4782 command->element3s = NULL;
4785 command->element3s = (unsigned short *)data;
4786 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4790 command->element3i = (int *)data;
4791 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// DPSOFTRAST_DrawTriangles: queues one indexed triangle batch.
// Allocates the draw command, runs the active shader mode's Vertex() callback,
// derives the command's row range from dpsoftrast.drawstarty/drawendy and then
// hands the command to the rasterizer thread(s).
//   firstvertex, numvertices, numtriangles, element3i, element3s - forwarded
//       unchanged to DPSOFTRAST_Draw_AllocateDrawCommand
// NOTE(review): braces, else and return lines are not visible in this listing
// (the embedded line numbers skip).
4796 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4798 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4799 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// row range, limited to the framebuffer height through bound()
4800 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4801 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release the command again
4802 if (command->starty >= command->endy)
// a commandsize no larger than the aligned header means the data area was
// not placed behind the header but allocated separately with MM_CALLOC in
// DPSOFTRAST_Draw_AllocateDrawCommand, so it has to be freed here
4804 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4805 MM_FREE(command->arrays);
4806 DPSOFTRAST_UndoCommand(command->commandsize);
4809 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each decrements it when it is done with the command
4810 command->refcount = dpsoftrast.numthreads;
4812 if (dpsoftrast.usethreads)
4815 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row band(s) overlap the command's rows
4816 for (i = 0; i < dpsoftrast.numthreads; i++)
4818 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4819 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4820 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch (the else line is not shown): process
// the queued commands right away
4825 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_Draw_InterpretCommands: executes the queued commands for one
// thread state, starting at thread->commandoffset and stopping once the
// offset equals endoffset.
//   thread    - state the DPSOFTRAST_Interpret_* handlers are applied to
//   endoffset - offset into dpsoftrast.commandpool.commands at which to stop
// NOTE(review): the embedded line numbers skip, so break statements, braces
// and the body of the Reset case are not visible here.
4829 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4831 int commandoffset = thread->commandoffset;
4832 while (commandoffset != endoffset)
4834 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4835 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call
// DPSOFTRAST_Interpret_<name>, advance by the aligned size of
// DPSOFTRAST_Command_<name> and wrap to offset 0 at the end of the pool
4837 #define INTERPCOMMAND(name) \
4838 case DPSOFTRAST_OPCODE_##name : \
4839 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4840 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4841 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4842 commandoffset = 0; \
4844 INTERPCOMMAND(Viewport)
4845 INTERPCOMMAND(ClearColor)
4846 INTERPCOMMAND(ClearDepth)
4847 INTERPCOMMAND(ColorMask)
4848 INTERPCOMMAND(DepthTest)
4849 INTERPCOMMAND(ScissorTest)
4850 INTERPCOMMAND(Scissor)
4851 INTERPCOMMAND(BlendFunc)
4852 INTERPCOMMAND(BlendSubtract)
4853 INTERPCOMMAND(DepthMask)
4854 INTERPCOMMAND(DepthFunc)
4855 INTERPCOMMAND(DepthRange)
4856 INTERPCOMMAND(PolygonOffset)
4857 INTERPCOMMAND(CullFace)
4858 INTERPCOMMAND(AlphaTest)
4859 INTERPCOMMAND(AlphaFunc)
4860 INTERPCOMMAND(SetTexture)
4861 INTERPCOMMAND(SetShader)
4862 INTERPCOMMAND(Uniform4f)
4863 INTERPCOMMAND(UniformMatrix4f)
4864 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so they advance by the size stored in the
// command itself, and thread->commandoffset is written back right away
4866 case DPSOFTRAST_OPCODE_Draw:
4867 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4868 commandoffset += command->commandsize;
4869 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4871 thread->commandoffset = commandoffset;
// the handling of Reset is on lines not shown
4874 case DPSOFTRAST_OPCODE_Reset:
// publish the final position once the walk has finished
4879 thread->commandoffset = commandoffset;
// DPSOFTRAST_Draw_Thread: entry point handed to Thread_CreateThread.
//   data - the DPSOFTRAST_State_Thread this thread works on
// Keeps running while thread->index >= 0. Whenever the thread's command offset
// differs from dpsoftrast.drawcommand it interprets the pending commands;
// once it has caught up it parks itself on thread->drawcond.
// NOTE(review): the return statement and braces are not visible in this listing.
4882 static int DPSOFTRAST_Draw_Thread(void *data)
4884 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4885 while(thread->index >= 0)
4887 if (thread->commandoffset != dpsoftrast.drawcommand)
4889 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// the caught-up test is repeated under drawmutex before going to sleep
4893 Thread_LockMutex(thread->drawmutex);
4894 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a flusher blocked on waitcond that this thread has nothing left to do
4896 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving is what producers test before signalling drawcond
4897 thread->starving = true;
4898 Thread_CondWait(thread->drawcond, thread->drawmutex);
4899 thread->starving = false;
4901 Thread_UnlockMutex(thread->drawmutex);
// DPSOFTRAST_Draw_FlushThreads: makes sure every thread state has processed
// the commands up to dpsoftrast.drawcommand, then marks the command pool as
// unused (usedcommands = 0).
// NOTE(review): braces and the else line are not visible in this listing
// (the embedded line numbers skip).
4907 static void DPSOFTRAST_Draw_FlushThreads(void)
4909 DPSOFTRAST_State_Thread *thread;
4911 DPSOFTRAST_Draw_SyncCommands();
4912 if (dpsoftrast.usethreads)
// pass 1: wake every thread that is behind and currently parked on drawcond
4914 for (i = 0; i < dpsoftrast.numthreads; i++)
4916 thread = &dpsoftrast.threads[i];
4917 if (thread->commandoffset != dpsoftrast.drawcommand)
4919 Thread_LockMutex(thread->drawmutex);
4920 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4921 Thread_CondSignal(thread->drawcond);
4922 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block on waitcond for each thread that is still behind
4925 for (i = 0; i < dpsoftrast.numthreads; i++)
4927 thread = &dpsoftrast.threads[i];
4928 if (thread->commandoffset != dpsoftrast.drawcommand)
4930 Thread_LockMutex(thread->drawmutex);
// NOTE(review): the wait is guarded by a single if, not a loop; if
// Thread_CondWait can return without a matching signal (spurious wakeup)
// the offset is not re-checked afterwards -- confirm against the wrapper
4931 if (thread->commandoffset != dpsoftrast.drawcommand)
4933 thread->waiting = true;
4934 Thread_CondWait(thread->waitcond, thread->drawmutex);
4935 thread->waiting = false;
4937 Thread_UnlockMutex(thread->drawmutex);
// presumably the non-threaded branch (the else line is not shown): run the
// pending commands for each thread state on the calling thread
4943 for (i = 0; i < dpsoftrast.numthreads; i++)
4945 thread = &dpsoftrast.threads[i];
4946 if (thread->commandoffset != dpsoftrast.drawcommand)
4947 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4950 dpsoftrast.commandpool.usedcommands = 0;
// DPSOFTRAST_Flush: public entry point that forwards to
// DPSOFTRAST_Draw_FlushThreads, i.e. it returns after the queued commands
// have been processed.
4953 void DPSOFTRAST_Flush(void)
4955 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_Finish: takes no arguments and returns nothing.
// NOTE(review): no body line is visible in this listing (the embedded numbers
// jump from 4958 to 4963) -- check the full file for what it does.
4958 void DPSOFTRAST_Finish(void)
// DPSOFTRAST_Init: resets the global dpsoftrast state and sets up the
// per-thread rasterizer states.
//   width, height - framebuffer size, stored in fb_width / fb_height
//   numthreads    - threading is used when numthreads > 0 and
//                   Thread_HasThreads() is true; the count is then limited to
//                   1..64 through bound(), otherwise one state is used
//   interlace     - limited to 0..1 through bound() when threading is used,
//                   otherwise forced to 0
//   colorpixels   - becomes fb_colorpixels[0]
//   depthpixels   - becomes fb_depthpixels
// NOTE(review): the embedded line numbers skip, so declarations (i, u), braces,
// the else line and the return statement are not visible here.
4963 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4973 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u is declared and filled on lines not shown
4974 dpsoftrast.bigendian = u.b[3];
4975 dpsoftrast.fb_width = width;
4976 dpsoftrast.fb_height = height;
4977 dpsoftrast.fb_depthpixels = depthpixels;
4978 dpsoftrast.fb_colorpixels[0] = colorpixels;
// clear the remaining colour targets; index 1 used to be assigned three times,
// which left slots 2 and 3 relying on the memset above (the four assignments
// imply four slots -- confirm against the declaration of fb_colorpixels)
4979 dpsoftrast.fb_colorpixels[1] = NULL;
4980 dpsoftrast.fb_colorpixels[2] = NULL;
4981 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
4982 dpsoftrast.viewport[0] = 0;
4983 dpsoftrast.viewport[1] = 0;
4984 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4985 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4986 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no storage allocated yet
// (dpsoftrast.texture is still zero from the memset)
4987 dpsoftrast.texture_firstfree = 1;
4988 dpsoftrast.texture_end = 1;
4989 dpsoftrast.texture_max = 0;
4990 dpsoftrast.color[0] = 1;
4991 dpsoftrast.color[1] = 1;
4992 dpsoftrast.color[2] = 1;
4993 dpsoftrast.color[3] = 1;
4994 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
4995 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
4996 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// zero-filled array of thread states (the result is not checked here)
4997 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4998 for (i = 0; i < dpsoftrast.numthreads; i++)
5000 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state of each thread
5002 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines -- confirm
// whether it is set on the line that is not shown or is left at zero
5003 thread->colormask[1] = 1;
5004 thread->colormask[2] = 1;
5005 thread->colormask[3] = 1;
5006 thread->blendfunc[0] = GL_ONE;
5007 thread->blendfunc[1] = GL_ZERO;
5008 thread->depthmask = true;
5009 thread->depthtest = true;
5010 thread->depthfunc = GL_LEQUAL;
5011 thread->scissortest = false;
5012 thread->alphatest = false;
5013 thread->alphafunc = GL_GREATER;
5014 thread->alphavalue = 0.5f;
5015 thread->viewport[0] = 0;
5016 thread->viewport[1] = 0;
5017 thread->viewport[2] = dpsoftrast.fb_width;
5018 thread->viewport[3] = dpsoftrast.fb_height;
5019 thread->scissor[0] = 0;
5020 thread->scissor[1] = 0;
5021 thread->scissor[2] = dpsoftrast.fb_width;
5022 thread->scissor[3] = dpsoftrast.fb_height;
5023 thread->depthrange[0] = 0;
5024 thread->depthrange[1] = 1;
5025 thread->polygonoffset[0] = 0;
5026 thread->polygonoffset[1] = 0;
// row ownership: with interlace the height is cut into 2*numthreads slices and
// thread i gets slice i (band 1) and slice numthreads+i (band 2)
5028 if (dpsoftrast.interlace)
5030 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5031 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5032 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5033 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise (the else line is not shown) both bands are the same single slice
// i out of numthreads
5037 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5038 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5041 thread->numspans = 0;
5042 thread->numtriangles = 0;
5043 thread->commandoffset = 0;
5044 thread->waiting = false;
5045 thread->starving = false;
5047 thread->validate = -1;
5048 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects and the worker itself exist only in threaded mode
5050 if (dpsoftrast.usethreads)
5052 thread->waitcond = Thread_CreateCond();
5053 thread->drawcond = Thread_CreateCond();
5054 thread->drawmutex = Thread_CreateMutex();
5055 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5061 void DPSOFTRAST_Shutdown(void)
5064 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5066 DPSOFTRAST_State_Thread *thread;
5067 for (i = 0; i < dpsoftrast.numthreads; i++)
5069 thread = &dpsoftrast.threads[i];
5070 Thread_LockMutex(thread->drawmutex);
5072 Thread_CondSignal(thread->drawcond);
5073 Thread_UnlockMutex(thread->drawmutex);
5074 Thread_WaitThread(thread->thread, 0);
5075 Thread_DestroyCond(thread->waitcond);
5076 Thread_DestroyCond(thread->drawcond);
5077 Thread_DestroyMutex(thread->drawmutex);
5080 for (i = 0;i < dpsoftrast.texture_end;i++)
5081 if (dpsoftrast.texture[i].bytes)
5082 MM_FREE(dpsoftrast.texture[i].bytes);
5083 if (dpsoftrast.texture)
5084 free(dpsoftrast.texture);
5085 if (dpsoftrast.threads)
5086 MM_FREE(dpsoftrast.threads);
5087 memset(&dpsoftrast, 0, sizeof(dpsoftrast));