3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 typedef qboolean bool;
14 #define ATOMIC_SIZE 32
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
70 #include <emmintrin.h>
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
81 #define MM_FREE _mm_free
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
104 typedef struct DPSOFTRAST_Texture_s
111 DPSOFTRAST_TEXTURE_FILTER filter;
114 ATOMIC_COUNTER binds;
115 unsigned char *bytes;
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// SSE version of the attribute setup for one span: loads row 0 of the triangle's
// attribs[arrayindex] into slope, and sets data to row 2 + span->x * row 0 + span->y * row 1.
// slope and data must be __m128 lvalues; _mm_load_ps requires the attribs rows to be 16-byte aligned.
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar version of DPSOFTRAST_CALCATTRIB: copies row 0 of the triangle's
// attribs[arrayindex] into slope[0..3] and sets
// data[k] = row2[k] + span->x * slope[k] + span->y * row1[k].
// slope and data must be indexable float lvalues with at least 4 elements.
// triangle and span are parenthesized so pointer expressions (e.g. spans + n) expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
233 float polygonoffset[2];
236 int shader_permutation;
237 int shader_exactspecularmath;
239 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
241 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
244 // DPSOFTRAST_VALIDATE_ flags
247 // derived values (DPSOFTRAST_VALIDATE_FB)
250 ALIGN(float fb_viewportcenter[4]);
251 ALIGN(float fb_viewportscale[4]);
253 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
256 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
265 ATOMIC(volatile int commandoffset);
267 volatile bool waiting;
268 volatile bool starving;
275 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
278 DPSOFTRAST_State_Thread);
280 typedef ATOMIC(struct DPSOFTRAST_State_s
284 unsigned int *fb_depthpixels;
285 unsigned int *fb_colorpixels[4];
288 ALIGN(float fb_viewportcenter[4]);
289 ALIGN(float fb_viewportscale[4]);
292 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
295 const float *pointer_vertex3f;
296 const float *pointer_color4f;
297 const unsigned char *pointer_color4ub;
298 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
303 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
307 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308 float *screencoord4f;
314 int shader_permutation;
315 int shader_exactspecularmath;
319 int texture_firstfree;
320 DPSOFTRAST_Texture *texture;
325 const char *errorstring;
330 DPSOFTRAST_State_Thread *threads;
332 ATOMIC(volatile int drawcommand);
334 DPSOFTRAST_State_Command_Pool commandpool;
338 DPSOFTRAST_State dpsoftrast;
// depth buffer encoding: scale applied to (1 - depth) when converting to integer depth
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit BGRA8 value:
// a in bits 24-31, r in bits 16-23, g in bits 8-15, b in bits 0-7, each rounded to nearest.
// Arguments are parenthesized so expressions can be passed safely, and the shifts are done
// on unsigned values so a full alpha channel does not shift into the sign bit of an int.
// The result is still an int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) \
	((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	       ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	       ((unsigned int)(int)((b) * 255.0f + 0.5f)) | \
	       ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer encoding (larger value for smaller d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
344 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
348 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350 fb_viewportcenter[3] = 0.5f;
351 fb_viewportcenter[0] = 0.0f;
352 fb_viewportscale[1] = 0.5f * viewport[2];
353 fb_viewportscale[2] = -0.5f * viewport[3];
354 fb_viewportscale[3] = 0.5f;
355 fb_viewportscale[0] = 1.0f;
// Recomputes the values derived from the scissor and viewport state of a thread:
// thread->fb_scissor (x, y, width, height in framebuffer rows/columns) and the
// viewport centre/scale arrays (via DPSOFTRAST_RecalcViewport).
358 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
360 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
361 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y flipped against the framebuffer height
364 x1 = thread->scissor[0];
365 x2 = thread->scissor[0] + thread->scissor[2];
366 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
367 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off, the whole framebuffer is used
368 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the upper bounds to the framebuffer
// NOTE(review): the matching lower-bound clamps are presumably on adjacent lines - confirm
370 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
372 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as origin + size
373 thread->fb_scissor[0] = x1;
374 thread->fb_scissor[1] = y1;
375 thread->fb_scissor[2] = x2 - x1;
376 thread->fb_scissor[3] = y2 - y1;
378 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
381 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
383 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
386 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
388 if (thread->blendsubtract)
390 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
392 #define BLENDFUNC(sfactor, dfactor, blendmode) \
393 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
394 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
395 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
400 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
402 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
403 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
404 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
405 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
406 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
407 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
408 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
409 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
410 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
411 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
412 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Inline guard around DPSOFTRAST_Validate: the call is only made when at least one of the
// flags in f is still set in thread->validate. The expression always evaluates to 0.
// Both arguments are parenthesized so that arbitrary expressions expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Brings the derived state of a thread up to date for the flags given in mask
// (a combination of DPSOFTRAST_VALIDATE_* bits). Each pending flag is cleared from
// thread->validate and the matching DPSOFTRAST_Recalc* function is run.
419 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only flags that are both requested and still pending are processed
421 mask &= thread->validate;
424 if (mask & DPSOFTRAST_VALIDATE_FB)
426 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
427 DPSOFTRAST_RecalcFB(thread);
429 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
431 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
432 DPSOFTRAST_RecalcDepthFunc(thread);
434 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
436 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
437 DPSOFTRAST_RecalcBlendFunc(thread);
441 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
443 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
444 return &dpsoftrast.texture[index];
// Enlarges the global texture array and re-points every bound-texture pointer
// (global and per-thread) at the same slot in the reallocated array.
448 static void DPSOFTRAST_Texture_Grow(void)
// remembered so that bound pointers can be rebased after the array moves
450 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
451 DPSOFTRAST_State_Thread *thread;
455 // expand texture array as needed
456 if (dpsoftrast.texture_max < 1024)
457 dpsoftrast.texture_max = 1024;
459 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites the only copy of the pointer and no NULL
// check is visible here; on failure the old array would be leaked and the rebasing
// below would work from a NULL base - confirm how out-of-memory is meant to be handled
460 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase: keep each pointer's element offset, relative to the new base address
461 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
462 if (dpsoftrast.texbound[i])
463 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
464 for (j = 0; j < dpsoftrast.numthreads; j++)
466 thread = &dpsoftrast.threads[j];
467 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
468 if (thread->texbound[i])
469 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions and flags in the first free slot of the
// global texture array. Invalid argument combinations are reported by setting
// dpsoftrast.errorstring.
// flags combines DPSOFTRAST_TEXTURE_FLAG_* bits with a DPSOFTRAST_TEXTURE_FORMAT_* value.
473 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces, everything else one
482 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
483 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
484 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is formed before the per-dimension size limit below is
// applied, so very large arguments could overflow int here - confirm
485 if (width*height*depth < 1)
487 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
490 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
492 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// per-format restrictions
497 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
498 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
499 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
501 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
502 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
512 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures (depth != 1) and cubemaps
519 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
524 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this condition and message repeat the check directly above - looks redundant
529 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
534 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
539 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
541 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
544 // find first empty slot in texture array
545 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
546 if (!dpsoftrast.texture[texnum].bytes)
// claim the slot, growing the array and the used range when the slot lies beyond them
548 dpsoftrast.texture_firstfree = texnum + 1;
549 if (dpsoftrast.texture_max <= texnum)
550 DPSOFTRAST_Texture_Grow();
551 if (dpsoftrast.texture_end <= texnum)
552 dpsoftrast.texture_end = texnum + 1;
553 texture = &dpsoftrast.texture[texnum];
554 memset(texture, 0, sizeof(*texture));
555 texture->flags = flags;
556 texture->width = width;
557 texture->height = height;
558 texture->depth = depth;
559 texture->sides = sides;
// per-mip-level table: [0] = value of size when the level is recorded, [1] = bytes in the
// level (4 bytes per texel, all sides), [2] = width, [3] = height, [4] = depth
571 s = w * h * d * sides * 4;
572 texture->mipmap[mipmaps][0] = size;
573 texture->mipmap[mipmaps][1] = s;
574 texture->mipmap[mipmaps][2] = w;
575 texture->mipmap[mipmaps][3] = h;
576 texture->mipmap[mipmaps][4] = d;
// reached the 1x1x1 level, or the texture has no mipmaps at all
579 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
585 texture->mipmaps = mipmaps;
586 texture->size = size;
588 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here - confirm
589 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of a texture and returns its slot to the free pool.
// Does nothing when index does not refer to a live texture.
593 void DPSOFTRAST_Texture_Free(int index)
595 DPSOFTRAST_Texture *texture;
596 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
600 MM_FREE(texture->bytes);
// the memset below clears bytes as well; a NULL bytes field is what marks the slot as free
601 texture->bytes = NULL;
602 memset(texture, 0, sizeof(*texture));
603 // adjust the free range and used range
604 if (dpsoftrast.texture_firstfree > index)
605 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing free slots
606 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
607 dpsoftrast.texture_end--;
// Regenerates mip levels 1 .. mipmaps-1 of a texture by averaging texels of the level
// above (4 bytes per texel). Does nothing when index is not a live texture.
// mipmap[level] columns as used here: [0] byte offset of the level inside texture->bytes,
// [2] width, [3] height, [4] depth.
609 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
611 int i, x, y, z, w, layer0, layer1, row0, row1;
612 unsigned char *o, *i0, *i1, *i2, *i3;
613 DPSOFTRAST_Texture *texture;
614 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
615 if (texture->mipmaps <= 1)
617 for (i = 1;i < texture->mipmaps;i++)
619 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's last layer
623 if (layer1 >= texture->mipmap[i-1][4])
624 layer1 = texture->mipmap[i-1][4]-1;
625 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's last row
629 if (row1 >= texture->mipmap[i-1][3])
630 row1 = texture->mipmap[i-1][3]-1;
// o: start of the output row; i0..i3: starts of the four parent rows
// (layer0/layer1 x row0/row1) that contribute to it
631 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
632 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
633 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
634 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
635 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
636 w = texture->mipmap[i][2];
639 if (texture->mipmap[i-1][2] > 1)
641 // average 3D texture
// eight parent texels per output texel, rounded to nearest (+4 before >> 3)
642 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
644 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
645 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
646 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
647 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
652 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read but not advanced by this loop; that only matters
// if w can exceed 1 here - confirm
653 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
655 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
656 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
657 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
658 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
664 if (texture->mipmap[i-1][2] > 1)
666 // average 2D texture (common case)
// four parent texels per output texel, rounded to nearest (+2 before >> 2)
667 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
669 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
670 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
671 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
672 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
677 // 2D texture with parent width == 1
678 o[0] = (i0[0] + i1[0] + 1) >> 1;
679 o[1] = (i0[1] + i1[1] + 1) >> 1;
680 o[2] = (i0[2] + i1[2] + 1) >> 1;
681 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a tightly packed block of 4-byte texels into a rectangle of a texture, row by
// row, and then regenerates the texture's mipmaps.
// Does nothing when index is not a live texture.
// NOTE(review): the visible code always addresses level 0 (mipmap[0]); the mip
// parameter is not used in the lines shown here - confirm whether that is intended.
// NOTE(review): no bounds check of the block against the texture size is visible - confirm.
688 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
690 DPSOFTRAST_Texture *texture;
692 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
697 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
698 while (blockheight > 0)
700 memcpy(dst, pixels, blockwidth * 4);
// the source advances by the block width, the destination by the full texture width
701 pixels += blockwidth * 4;
702 dst += texture->mipmap[0][2] * 4;
706 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of a texture with the given pixels and regenerates its
// mipmaps. Does nothing when index is not a live texture.
708 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
710 DPSOFTRAST_Texture *texture;
711 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// mipmap[0][1] is the byte size of level 0, so pixels must provide at least that many bytes
715 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
716 DPSOFTRAST_Texture_CalculateMipmaps(index);
718 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
720 DPSOFTRAST_Texture *texture;
721 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722 return texture->mipmap[mip][2];
724 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
726 DPSOFTRAST_Texture *texture;
727 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728 return texture->mipmap[mip][3];
730 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level mip inside the texture's pixel
// storage, or 0 when index is not a live texture.
// NOTE(review): mip is used as an array index without a visible range check - confirm
// that callers only pass valid levels.
736 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
738 DPSOFTRAST_Texture *texture;
739 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
742 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Does nothing when index is not a live texture.
// Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are reported through
// dpsoftrast.errorstring when the texture was created without the mipmap flag.
744 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
746 DPSOFTRAST_Texture *texture;
747 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
748 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
750 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
755 texture->filter = filter;
// Selects the framebuffer to render into: its size, one depth buffer and up to four
// colour buffers. The pointers are stored as given; no copy of the pixel memory is made.
758 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
// true when anything about the target differs from the current one
760 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
761 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
762 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
764 dpsoftrast.fb_width = width;
765 dpsoftrast.fb_height = height;
766 dpsoftrast.fb_depthpixels = depthpixels;
767 dpsoftrast.fb_colorpixels[0] = colorpixels0;
768 dpsoftrast.fb_colorpixels[1] = colorpixels1;
769 dpsoftrast.fb_colorpixels[2] = colorpixels2;
770 dpsoftrast.fb_colorpixels[3] = colorpixels3;
773 static void DPSOFTRAST_Draw_FlushThreads(void);
775 static void DPSOFTRAST_Draw_SyncCommands(void)
777 if(dpsoftrast.usethreads) MEMORY_BARRIER;
778 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least space bytes available in the command pool by recomputing how
// much of the pool is still in use by the threads, waiting on a thread when that is
// not enough.
781 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
783 DPSOFTRAST_State_Thread *thread;
785 int freecommand = dpsoftrast.commandpool.freecommand;
786 int usedcommands = dpsoftrast.commandpool.usedcommands;
787 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
789 DPSOFTRAST_Draw_SyncCommands();
// find the thread that is furthest behind the write position; its distance is the
// number of pool bytes still in use
795 for (i = 0; i < dpsoftrast.numthreads; i++)
797 thread = &dpsoftrast.threads[i];
798 commandoffset = freecommand - thread->commandoffset;
// the pool is used as a ring, so a negative distance wraps around
799 if (commandoffset < 0)
800 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
801 if (commandoffset > usedcommands)
804 usedcommands = commandoffset;
807 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait, under the thread's draw mutex, on its wait condition
809 thread = &dpsoftrast.threads[waitindex];
810 Thread_LockMutex(thread->drawmutex);
811 if (thread->commandoffset != dpsoftrast.drawcommand)
813 thread->waiting = true;
// a starving thread is signalled on its draw condition before we wait
814 if (thread->starving) Thread_CondSignal(thread->drawcond);
815 Thread_CondWait(thread->waitcond, thread->drawmutex);
816 thread->waiting = false;
818 Thread_UnlockMutex(thread->drawmutex);
820 dpsoftrast.commandpool.usedcommands = usedcommands;
823 #define DPSOFTRAST_ALIGNCOMMAND(size) \
824 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
825 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
826 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves size bytes for a new command in the command pool (used as a ring buffer) and
// fills in the command header (opcode and commandsize).
// size is expected to be already rounded with DPSOFTRAST_ALIGNCOMMAND, as done by
// DPSOFTRAST_ALLOCATECOMMAND.
828 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
830 DPSOFTRAST_Command *command;
831 int freecommand = dpsoftrast.commandpool.freecommand;
832 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra: room for one more command header, plus the unusable tail of the pool when the
// command does not fit before the end
833 int extra = sizeof(DPSOFTRAST_Command);
834 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
835 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim some
836 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
838 if (dpsoftrast.usethreads)
839 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
841 DPSOFTRAST_Draw_FlushThreads();
// reload: the calls above may have changed the pool state
842 freecommand = dpsoftrast.commandpool.freecommand;
843 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: put a Reset marker at the current
// position and count the skipped tail as used
845 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
847 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
848 command->opcode = DPSOFTRAST_OPCODE_Reset;
849 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
852 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
853 command->opcode = opcode;
854 command->commandsize = size;
856 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
858 dpsoftrast.commandpool.freecommand = freecommand;
859 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives size bytes back to the command pool by moving the write position and the
// used-byte count back.
// NOTE(review): presumably only valid for the most recently allocated command - confirm.
863 static void DPSOFTRAST_UndoCommand(int size)
865 int freecommand = dpsoftrast.commandpool.freecommand;
866 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around of the write position
869 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
870 usedcommands -= size;
871 dpsoftrast.commandpool.freecommand = freecommand;
872 dpsoftrast.commandpool.usedcommands = usedcommands;
875 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
876 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
878 thread->viewport[0] = command->x;
879 thread->viewport[1] = command->y;
880 thread->viewport[2] = command->width;
881 thread->viewport[3] = command->height;
882 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Sets the viewport rectangle: queues a Viewport command and also updates the global
// copy of the viewport together with its derived centre/scale values.
884 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
886 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
889 command->width = width;
890 command->height = height;
// global copy, kept in step with what was queued
892 dpsoftrast.viewport[0] = x;
893 dpsoftrast.viewport[1] = y;
894 dpsoftrast.viewport[2] = width;
895 dpsoftrast.viewport[3] = height;
896 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
899 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Executes a queued ClearColor command for one thread: fills the scissor rectangle of
// every attached colour buffer with the command's colour, restricted to the rows
// assigned to this thread (miny1..maxy1 and miny2..maxy2).
900 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
902 int i, x1, y1, x2, y2, w, h, x, y;
903 int miny1 = thread->miny1;
904 int maxy1 = thread->maxy1;
905 int miny2 = thread->miny2;
906 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
910 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
911 x1 = thread->fb_scissor[0];
912 y1 = thread->fb_scissor[1];
913 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
914 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
915 if (y1 < miny1) y1 = miny1;
916 if (y2 > maxy2) y2 = maxy2;
921 // FIXME: honor fb_colormask?
922 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// one pass per colour buffer
923 for (i = 0;i < 4;i++)
925 if (!dpsoftrast.fb_colorpixels[i])
// two bands: first rows up to maxy1, then continuing from miny2 up to maxy2
927 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
930 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
931 for (x = x1;x < x2;x++)
// Queues a ClearColor command for the given colour; the clear itself is carried out by
// DPSOFTRAST_Interpret_ClearColor.
936 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
938 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
945 DEFCOMMAND(3, ClearDepth, float depth;)
// Executes a queued ClearDepth command for one thread: fills the scissor rectangle of
// the depth buffer with the encoded depth value, restricted to the rows assigned to this
// thread (miny1..maxy1 and miny2..maxy2).
946 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
948 int x1, y1, x2, y2, w, h, x, y;
949 int miny1 = thread->miny1;
950 int maxy1 = thread->maxy1;
951 int miny2 = thread->miny2;
952 int maxy2 = thread->maxy2;
// fb_scissor must be current before it is read
956 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957 x1 = thread->fb_scissor[0];
958 y1 = thread->fb_scissor[1];
959 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the row range to the rows owned by this thread
961 if (y1 < miny1) y1 = miny1;
962 if (y2 > maxy2) y2 = maxy2;
967 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two bands: first rows up to maxy1, then continuing from miny2 up to maxy2
968 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
971 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
972 for (x = x1;x < x2;x++)
// Queues a ClearDepth command for depth value d; the clear itself is carried out by
// DPSOFTRAST_Interpret_ClearDepth.
976 void DPSOFTRAST_ClearDepth(float d)
978 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
982 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
983 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
985 thread->colormask[0] = command->r != 0;
986 thread->colormask[1] = command->g != 0;
987 thread->colormask[2] = command->b != 0;
988 thread->colormask[3] = command->a != 0;
989 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command; a non-zero argument enables writes to that channel
// (see DPSOFTRAST_Interpret_ColorMask).
991 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
993 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1000 DEFCOMMAND(5, DepthTest, int enable;)
1001 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1003 thread->depthtest = command->enable;
1004 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1006 void DPSOFTRAST_DepthTest(int enable)
1008 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1009 command->enable = enable;
1012 DEFCOMMAND(6, ScissorTest, int enable;)
1013 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1015 thread->scissortest = command->enable;
1016 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1018 void DPSOFTRAST_ScissorTest(int enable)
1020 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1021 command->enable = enable;
1024 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1025 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1027 thread->scissor[0] = command->x;
1028 thread->scissor[1] = command->y;
1029 thread->scissor[2] = command->width;
1030 thread->scissor[3] = command->height;
1031 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command with the given rectangle (applied by DPSOFTRAST_Interpret_Scissor).
1033 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1035 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1038 command->width = width;
1039 command->height = height;
1042 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1043 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1045 thread->blendfunc[0] = command->sfactor;
1046 thread->blendfunc[1] = command->dfactor;
1047 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1049 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1051 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1052 command->sfactor = sfactor;
1053 command->dfactor = dfactor;
1056 DEFCOMMAND(9, BlendSubtract, int enable;)
1057 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1059 thread->blendsubtract = command->enable;
1060 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1062 void DPSOFTRAST_BlendSubtract(int enable)
1064 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1065 command->enable = enable;
1068 DEFCOMMAND(10, DepthMask, int enable;)
1069 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1071 thread->depthmask = command->enable;
1073 void DPSOFTRAST_DepthMask(int enable)
1075 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1076 command->enable = enable;
1079 DEFCOMMAND(11, DepthFunc, int func;)
1080 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1082 thread->depthfunc = command->func;
// Public entry point: obtains a DepthFunc command and records the function id in it.
1084 void DPSOFTRAST_DepthFunc(int func)
1086 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1087 command->func = func;
// Declares the DepthRange command; its payload is the near and far depth values.
1090 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: stores near/far in thread->depthrange[0..1].
1091 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1093 thread->depthrange[0] = command->nearval;
1094 thread->depthrange[1] = command->farval;
// Public entry point: obtains a DepthRange command and records the near/far values in it.
1096 void DPSOFTRAST_DepthRange(float nearval, float farval)
1098 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1099 command->nearval = nearval;
1100 command->farval = farval;
// Declares the PolygonOffset command; its payload is the two offset values.
1103 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: stores alongnormal/intoview in thread->polygonoffset[0..1].
1104 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1106 thread->polygonoffset[0] = command->alongnormal;
1107 thread->polygonoffset[1] = command->intoview;
// Public entry point: obtains a PolygonOffset command and records both offsets in it.
1109 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1111 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1112 command->alongnormal = alongnormal;
1113 command->intoview = intoview;
// Declares the CullFace command; its payload is the culling mode.
1116 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1117 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1119 thread->cullface = command->mode;
// Public entry point: obtains a CullFace command and records the mode in it.
1121 void DPSOFTRAST_CullFace(int mode)
1123 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1124 command->mode = mode;
// Declares the AlphaTest command; its payload is a single enable flag.
1127 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: stores the flag in thread->alphatest.
1128 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1130 thread->alphatest = command->enable;
// Public entry point: obtains an AlphaTest command and records the enable flag in it.
1132 void DPSOFTRAST_AlphaTest(int enable)
1134 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1135 command->enable = enable;
// Declares the AlphaFunc command; its payload is the comparison function id and reference value.
1138 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores func in thread->alphafunc and ref in thread->alphavalue.
1139 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1141 thread->alphafunc = command->func;
1142 thread->alphavalue = command->ref;
// Public entry point: obtains an AlphaFunc command and records the function id in it.
// NOTE(review): the store of `ref` into the command is not visible in this excerpt - confirm.
1144 void DPSOFTRAST_AlphaFunc(int func, float ref)
1146 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1147 command->func = func;
// Sets the current constant colour: writes r, g, b, a straight into dpsoftrast.color[0..3].
// Unlike the command-based setters nearby, no command is allocated here.
1151 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1153 dpsoftrast.color[0] = r;
1154 dpsoftrast.color[1] = g;
1155 dpsoftrast.color[2] = b;
1156 dpsoftrast.color[3] = a;
// Reads a rectangle of the colour framebuffer (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, 4 bytes per pixel, rows blockwidth*4 bytes apart
// The rectangle is clipped to the framebuffer. Source rows are addressed as
// (fb_height - 1 - y), so rows are read in reverse order relative to framebuffer storage.
// NOTE(review): destination offsets are computed from the clipped origin (y - by1, no x term),
// so a rectangle clipped on the left/top still starts at the first byte of outpixels - confirm
// callers expect that.
1159 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1161 int outstride = blockwidth * 4;
1162 int instride = dpsoftrast.fb_width * 4;
1165 int bx2 = blockx + blockwidth;
1166 int by2 = blocky + blockheight;
1170 unsigned char *inpixels;
// clip the requested rectangle to the framebuffer bounds
1174 if (bx1 < 0) bx1 = 0;
1175 if (by1 < 0) by1 = 0;
1176 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1177 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1179 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a per-pixel loop (its body is not visible in this excerpt)
1180 if (dpsoftrast.bigendian)
1182 for (y = by1;y < by2;y++)
1184 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1185 o = (unsigned char *)outpixels + (y - by1) * outstride;
1186 for (x = bx1;x < bx2;x++)
// second per-row loop, using the same source/destination addressing
1199 for (y = by1;y < by2;y++)
1201 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle of the colour framebuffer, starting at (sx, sy), into
// mip level `mip` of texture `index`, starting at (tx, ty).
// Does nothing if the texture lookup fails or mip is outside [0, texture->mipmaps).
// Both rectangles are clipped to their surfaces and the copy size is limited to the smaller
// of the two. If the texture has more than one mip level, the mipmaps are recalculated.
1208 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1212 int tx2 = tx + width;
1213 int ty2 = ty + height;
1216 int sx2 = sx + width;
1217 int sy2 = sy + height;
1227 unsigned int *spixels;
1228 unsigned int *tpixels;
1229 DPSOFTRAST_Texture *texture;
1230 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1231 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the colour framebuffer
1233 spixels = dpsoftrast.fb_colorpixels[0];
1234 swidth = dpsoftrast.fb_width;
1235 sheight = dpsoftrast.fb_height;
// destination surface: the selected mip level; mipmap[mip][0] is its byte offset
// into texture->bytes, [2] and [3] its width and height
1236 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1237 twidth = texture->mipmap[mip][2];
1238 theight = texture->mipmap[mip][3];
// clip both rectangles to their surfaces
1239 if (tx1 < 0) tx1 = 0;
1240 if (ty1 < 0) ty1 = 0;
1241 if (tx2 > twidth) tx2 = twidth;
1242 if (ty2 > theight) ty2 = theight;
1243 if (sx1 < 0) sx1 = 0;
1244 if (sy1 < 0) sy1 = 0;
1245 if (sx2 > swidth) sx2 = swidth;
1246 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1251 if (tw > sw) tw = sw;
1252 if (th > sh) th = sh;
1253 if (tw < 1 || th < 1)
// rebase the source row so that successive texture rows read framebuffer rows in descending order
1255 sy1 = sheight - 1 - sy1;
1256 for (y = 0;y < th;y++)
1257 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1258 if (texture->mipmaps > 1)
1259 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Declares the SetTexture command; its payload is the texture unit number and the texture pointer.
1262 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: if a texture is currently bound to the
// unit, its binds counter is atomically decremented; the unit is then pointed at command->texture.
1263 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1265 if (thread->texbound[command->unitnum])
1266 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1267 thread->texbound[command->unitnum] = command->texture;
1269 void DPSOFTRAST_SetTexture(int unitnum, int index)
1271 DPSOFTRAST_Command_SetTexture *command;
1272 DPSOFTRAST_Texture *texture;
1273 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1275 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1278 texture = DPSOFTRAST_Texture_GetByIndex(index);
1279 if (index && !texture)
1281 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1285 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1286 command->unitnum = unitnum;
1287 command->texture = texture;
1289 dpsoftrast.texbound[unitnum] = texture;
1290 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array for subsequent draws: the pointer goes to
// dpsoftrast.pointer_vertex3f and the stride to dpsoftrast.stride_vertex.
// The data is not copied here; only the pointer is stored.
1293 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1295 dpsoftrast.pointer_vertex3f = vertex3f;
1296 dpsoftrast.stride_vertex = stride;
// Records a float colour array for subsequent draws and clears the byte colour pointer,
// so only one of the two colour sources is set at a time.
1298 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1300 dpsoftrast.pointer_color4f = color4f;
1301 dpsoftrast.pointer_color4ub = NULL;
1302 dpsoftrast.stride_color = stride;
// Records an unsigned-byte colour array for subsequent draws and clears the float colour
// pointer, so only one of the two colour sources is set at a time.
1304 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1306 dpsoftrast.pointer_color4f = NULL;
1307 dpsoftrast.pointer_color4ub = color4ub;
1308 dpsoftrast.stride_color = stride;
// Records the texture-coordinate array for unit `unitnum`: pointer, component count and stride.
// NOTE(review): unitnum is used as an array index without a range check - confirm callers
// guarantee it is within the bounds of these arrays.
1310 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1312 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1313 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1314 dpsoftrast.stride_texcoord[unitnum] = stride;
// Declares the SetShader command; its payload is the shader mode, permutation and the
// exact-specular-math flag.
1317 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command: copies the three values into the thread's shader_* fields.
1318 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1320 thread->shader_mode = command->mode;
1321 thread->shader_permutation = command->permutation;
1322 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: obtains a SetShader command carrying the new shader selection and
// also mirrors the same three values into the global dpsoftrast state.
1324 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1326 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1327 command->mode = mode;
1328 command->permutation = permutation;
1329 command->exactspecularmath = exactspecularmath;
// keep the global copy in step with what was put in the command
1331 dpsoftrast.shader_mode = mode;
1332 dpsoftrast.shader_permutation = permutation;
1333 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Declares the Uniform4f command; its payload is a uniform index and four floats.
1336 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into thread->uniform4f at float
// offset index*4 (each uniform slot is four floats wide).
1337 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1339 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: sets a four-float uniform from individual components. The values go
// into a new Uniform4f command and are also mirrored into dpsoftrast.uniform4f at index*4.
1341 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1343 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1344 command->index = index;
1345 command->val[0] = v0;
1346 command->val[1] = v1;
1347 command->val[2] = v2;
1348 command->val[3] = v3;
// mirror into the global uniform table
1350 dpsoftrast.uniform4f[index*4+0] = v0;
1351 dpsoftrast.uniform4f[index*4+1] = v1;
1352 dpsoftrast.uniform4f[index*4+2] = v2;
1353 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: reads four floats from v, stores them in a new
// Uniform4f command and mirrors them into dpsoftrast.uniform4f at index*4.
1355 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1357 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1358 command->index = index;
1359 memcpy(command->val, v, sizeof(command->val));
1361 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Declares the UniformMatrix4f command; its payload is a uniform index and sixteen floats
// declared through the ALIGN() macro.
1364 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all sixteen floats into thread->uniform4f
// starting at float offset index*4, i.e. the matrix fills four consecutive uniform slots.
1365 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1367 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   uniform   - first uniform slot; each matrix advances the slot index by 4
//   arraysize - number of matrices to read from v (16 floats each)
//   transpose - presumably selects the transposing step below; its condition is not
//               visible in this excerpt - confirm
//   v         - source floats; may be unaligned (checked against ALIGN_SIZE)
// For each matrix a UniformMatrix4f command is allocated and the resulting four rows are
// stored both in the command and in dpsoftrast.uniform4f.
1369 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1373 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1375 __m128 m0, m1, m2, m3;
1376 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1377 command->index = (DPSOFTRAST_UNIFORM)index;
// unaligned source: use unaligned loads
1378 if (((size_t)v)&(ALIGN_SIZE-1))
1380 m0 = _mm_loadu_ps(v);
1381 m1 = _mm_loadu_ps(v+4);
1382 m2 = _mm_loadu_ps(v+8);
1383 m3 = _mm_loadu_ps(v+12);
// aligned source: use aligned loads
1387 m0 = _mm_load_ps(v);
1388 m1 = _mm_load_ps(v+4);
1389 m2 = _mm_load_ps(v+8);
1390 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1394 __m128 t0, t1, t2, t3;
1395 t0 = _mm_unpacklo_ps(m0, m1);
1396 t1 = _mm_unpacklo_ps(m2, m3);
1397 t2 = _mm_unpackhi_ps(m0, m1);
1398 t3 = _mm_unpackhi_ps(m2, m3);
1399 m0 = _mm_movelh_ps(t0, t1);
1400 m1 = _mm_movehl_ps(t1, t0);
1401 m2 = _mm_movelh_ps(t2, t3);
1402 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload and the global uniform table
1404 _mm_store_ps(command->val, m0);
1405 _mm_store_ps(command->val+4, m1);
1406 _mm_store_ps(command->val+8, m2);
1407 _mm_store_ps(command->val+12, m3);
1408 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1409 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1410 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1411 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Declares the Uniform1i command; its payload is a uniform index and one integer.
1416 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the integer in thread->uniform1i[index].
1417 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1419 thread->uniform1i[command->index] = command->val;
// Public entry point: sets an integer uniform. Obtains a Uniform1i command for `index`
// and mirrors i0 into dpsoftrast.uniform1i.
// NOTE(review): the store of i0 into the command's val field is not visible in this
// excerpt - confirm.
1421 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1423 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1424 command->index = index;
1427 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` four-float elements from a strided byte source into dst (four floats per
// element; `end` marks dst + size*4).
// Unaligned loads are used when either src or stride is not a multiple of ALIGN_SIZE;
// otherwise aligned loads are used. Stores always use _mm_store_ps, which requires dst to
// be 16-byte aligned.
1431 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1433 float *end = dst + size*4;
1434 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1438 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1447 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements from a strided byte source into four-float elements
// in dst, setting the fourth component of each to 1.0f.
// Tightly packed input (stride == sizeof(float[3])) is converted four elements at a time
// from three 128-bit loads, up to end4 (size rounded down to a multiple of four). The
// general path converts one element per 128-bit load.
// The recurring pattern shuffle(2,1,0,3) / move_ss(1.0f) / shuffle(0,3,2,1) rotates the
// vector so the unwanted lane sits in lane 0, overwrites it with 1.0f and rotates back,
// giving (x, y, z, 1).
// NOTE(review): the one-element path loads four floats from a three-float element, i.e. it
// reads one float past the element - confirm source buffers tolerate that at the end.
1454 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1456 float *end = dst + size*4;
1457 if (stride == sizeof(float[3]))
1459 float *end4 = dst + (size&~3)*4;
1460 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// packed, unaligned: 12 floats in (v1, v2, v3) become 4 output vectors
1464 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1465 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1466 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1469 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1472 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1473 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1476 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1478 src += 4*sizeof(float[3]);
// packed, aligned: same conversion with aligned loads
1485 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1486 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1487 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1488 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1490 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1493 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1494 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1497 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1499 src += 4*sizeof(float[3]);
// one element at a time: unaligned loads if src or stride is not ALIGN_SIZE-aligned
1503 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1507 __m128 v = _mm_loadu_ps((const float *)src);
1508 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1509 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1510 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1511 _mm_store_ps(dst, v);
1520 __m128 v = _mm_load_ps((const float *)src);
1521 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1522 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1523 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1524 _mm_store_ps(dst, v);
// Expands `size` two-float elements from a strided byte source into four-float elements
// in dst, padding each to (x, y, 0, 1).
// Tightly packed input (stride == sizeof(float[2])) is converted two elements per 128-bit
// load, up to end2 (size rounded down to an even count). The remaining path loads one
// element at a time into the low half of the padding vector.
1531 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1533 float *end = dst + size*4;
// padding template: supplies the (0, 1) upper half of every output vector
1534 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1535 if (stride == sizeof(float[2]))
1537 float *end2 = dst + (size&~1)*4;
1538 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v holds (x0, y0, x1, y1): the low half pairs with the template, then the high half does
1542 __m128 v = _mm_loadu_ps((const float *)src);
1543 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1544 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1546 src += 2*sizeof(float[2]);
1553 __m128 v = _mm_load_ps((const float *)src);
1554 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1555 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1557 src += 2*sizeof(float[2]);
// single element: replace the low two lanes of the template with (x, y)
1563 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements from a strided byte source into four-float elements
// in dst, scaling each byte by 1/255.
// Tightly packed input (stride == sizeof(unsigned char[4])) is converted four elements per
// 128-bit load, up to end4 (size rounded down to a multiple of four). The remaining path
// converts one element at a time from a 32-bit read.
// Bytes are widened to 32-bit integers by unpacking against zero twice (8 -> 16 -> 32 bits).
1569 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1571 float *end = dst + size*4;
1572 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1573 if (stride == sizeof(unsigned char[4]))
1575 float *end4 = dst + (size&~3)*4;
1576 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1580 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1581 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1582 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1583 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1584 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1586 src += 4*sizeof(unsigned char[4]);
1593 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1594 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1595 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1596 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1597 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1599 src += 4*sizeof(unsigned char[4]);
// NOTE(review): reads the four bytes through an int pointer; src is only known to be a
// byte pointer here - confirm alignment/aliasing is acceptable on the target platforms
1605 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1606 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of one four-float value read from src; `end` marks dst + 4*size.
// The source is read once with an unaligned load; the stores use _mm_store_ps, which
// requires dst to be 16-byte aligned. (The loop lines are not visible in this excerpt.)
1612 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1614 float *end = dst + 4*size;
1615 __m128 v = _mm_loadu_ps(src);
1618 _mm_store_ps(dst, v);
// Transforms numitems four-float vectors from in4f by the 4x4 matrix inmatrix16f, writing
// the results to out4f.
// With m0..m3 being the four consecutive four-float groups of the matrix, each result is
//   v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// A matrix that compares byte-equal to the identity takes a fast path: a plain memcpy,
// skipped when out4f and in4f are the same pointer.
// The matrix is read with unaligned loads; input uses unaligned loads only when in4f is
// not ALIGN_SIZE-aligned.
1624 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1627 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1628 __m128 m0, m1, m2, m3;
1630 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1632 // fast case for identity matrix
1633 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1636 end = out4f + numitems*4;
1637 m0 = _mm_loadu_ps(inmatrix16f);
1638 m1 = _mm_loadu_ps(inmatrix16f + 4);
1639 m2 = _mm_loadu_ps(inmatrix16f + 8);
1640 m3 = _mm_loadu_ps(inmatrix16f + 12);
1641 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1645 __m128 v = _mm_loadu_ps(in4f);
1647 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1648 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1649 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1650 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1659 __m128 v = _mm_load_ps(in4f);
1661 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1662 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1663 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1664 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems four-float vertices from in4f to out4f.
//
//   out4f    - destination, room for numitems*4 floats
//   in4f     - source, numitems*4 floats
//   numitems - number of vertices; zero or negative counts copy nothing
//
// An in-place call (out4f == in4f) is a no-op, matching the identity fast path of
// DPSOFTRAST_Vertex_Transform. Partially overlapping ranges are still not supported.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would convert to a huge size_t below, and memcpy with
	// identical source and destination is undefined, so bail out on both
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
// Perspective-divides the vector `in` and applies the viewport transform, writing `out`.
// The vector is rotated to (w, x, y, z) and lane 0 is replaced by 1.0f, giving
// p = viewportcenter + viewportscale * (1, x, y, z) / w, which is then rotated back.
// viewportcenter/viewportscale therefore carry the term for the 1/w lane in lane 0 and
// the x, y, z terms in lanes 1..3.
1678 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1680 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1681 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1682 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1683 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro with the same parameters as DPSOFTRAST_PROJECTVERTEX.
// NOTE(review): the lines visible here are identical to DPSOFTRAST_PROJECTVERTEX - confirm
// whether the two macros are meant to differ.
1686 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1688 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a four-float vector by a matrix given as four vectors:
//   out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3
// (p is declared on a line not visible in this excerpt; presumably it is a copy of `in`).
1694 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1697 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1698 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1699 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1700 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Estimates the range of screen rows covered by the box [minpos, maxpos] after it is
// transformed by the matrix (m0..m3), and writes that range to *starty / *endy.
//
// Outline, as far as the visible lines show:
//  - BBFRONT(k, pos) transforms box corner k, computes clipdist = z + w and, when that is
//    >= 0, clears bit k of clipmask and folds the corner's projected value (lane 0 divided
//    by w) into minproj/maxproj.
//  - BBCLIP(k) handles corners whose bit is still set: for each of the three neighbours
//    (k^1, k^2, k^4) whose bit is clear, it interpolates along that edge to the point
//    where clipdist reaches zero and folds the projected point into minproj/maxproj.
//  - The result is clamped to [-2, 2], run through the viewport centre/scale and truncated
//    to integers.
// The return statement is not visible in this excerpt; callers store the result in
// dpsoftrast.drawclipped.
1703 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
// one bit per box corner; a bit is cleared once the corner passes the clipdist >= 0 test
1705 int clipmask = 0xFF;
1706 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of every matrix vector so the transformed y lands in lane 0,
// where the scalar (_ss) operations below work
1707 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1708 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1709 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1710 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1711 #define BBFRONT(k, pos) \
1713 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1714 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1715 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1718 clipmask &= ~(1<<k); \
1719 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1720 minproj = _mm_min_ss(minproj, proj); \
1721 maxproj = _mm_max_ss(maxproj, proj); \
// box corners built by mixing components of minpos and maxpos
1725 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1726 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1727 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1728 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1729 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1730 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1734 if (clipmask&(1<<k)) \
1736 if (!(clipmask&(1<<(k^1)))) \
1738 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1739 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1740 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1741 minproj = _mm_min_ss(minproj, proj); \
1742 maxproj = _mm_max_ss(maxproj, proj); \
1744 if (!(clipmask&(1<<(k^2)))) \
1746 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1747 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1748 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1749 minproj = _mm_min_ss(minproj, proj); \
1750 maxproj = _mm_max_ss(maxproj, proj); \
1752 if (!(clipmask&(1<<(k^4)))) \
1754 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1755 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1756 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1757 minproj = _mm_min_ss(minproj, proj); \
1758 maxproj = _mm_max_ss(maxproj, proj); \
1762 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring lane 2 of the viewport vectors (the y term in the layout DPSOFTRAST_PROJECTVERTEX uses) into lane 0
1763 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1764 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before scaling
1765 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1766 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1767 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1768 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// note: the start row comes from maxproj and the end row (plus one) from minproj
1769 *starty = _mm_cvttss_si32(maxproj);
1770 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems four-float vertices without applying a matrix.
//   out4f    - receives an unmodified copy of each input vertex
//   screen4f - receives each vertex after DPSOFTRAST_PROJECTVERTEX with the framebuffer
//              viewport centre/scale
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f     - input vertices; unaligned loads are used when not ALIGN_SIZE-aligned
// While iterating, the component-wise min and max of the input vertices are tracked; that
// box is passed to DPSOFTRAST_Vertex_BoundY together with an identity matrix.
// Returns the value returned by DPSOFTRAST_Vertex_BoundY.
1774 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1776 float *end = out4f + numitems*4;
1777 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1778 __m128 minpos, maxpos;
1779 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1781 minpos = maxpos = _mm_loadu_ps(in4f);
1784 __m128 v = _mm_loadu_ps(in4f);
1785 minpos = _mm_min_ps(minpos, v);
1786 maxpos = _mm_max_ps(maxpos, v);
1787 _mm_store_ps(out4f, v);
1788 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1789 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1797 minpos = maxpos = _mm_load_ps(in4f);
1800 __m128 v = _mm_load_ps(in4f);
1801 minpos = _mm_min_ps(minpos, v);
1802 maxpos = _mm_max_ps(maxpos, v);
1803 _mm_store_ps(out4f, v);
1804 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1805 _mm_store_ps(screen4f, v);
// no matrix was applied, so the bounds are evaluated with an identity matrix
1812 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1813 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1814 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1815 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1816 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems four-float vertices by inmatrix16f and projects them.
//   out4f    - receives each transformed vertex
//   screen4f - receives each transformed vertex after DPSOFTRAST_PROJECTVERTEX
//   starty, endy - receive the row range computed by DPSOFTRAST_Vertex_BoundY
//   in4f     - input vertices; unaligned loads are used when not ALIGN_SIZE-aligned
// A matrix that compares byte-equal to the identity is delegated to
// DPSOFTRAST_Vertex_Project. Otherwise the component-wise min/max of the *untransformed*
// inputs is tracked and handed to DPSOFTRAST_Vertex_BoundY along with the matrix.
// Returns the value returned by the function that finishes the work.
1820 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1822 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1823 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1825 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1826 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1827 end = out4f + numitems*4;
1828 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1829 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1830 m0 = _mm_loadu_ps(inmatrix16f);
1831 m1 = _mm_loadu_ps(inmatrix16f + 4);
1832 m2 = _mm_loadu_ps(inmatrix16f + 8);
1833 m3 = _mm_loadu_ps(inmatrix16f + 12);
1834 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounds with the first vertex
1836 minpos = maxpos = _mm_loadu_ps(in4f);
1839 __m128 v = _mm_loadu_ps(in4f);
1840 minpos = _mm_min_ps(minpos, v);
1841 maxpos = _mm_max_ps(maxpos, v);
1842 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1843 _mm_store_ps(out4f, v);
1844 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1845 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1853 minpos = maxpos = _mm_load_ps(in4f);
1856 __m128 v = _mm_load_ps(in4f);
1857 minpos = _mm_min_ps(minpos, v);
1858 maxpos = _mm_max_ps(maxpos, v);
1859 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1860 _mm_store_ps(out4f, v);
1861 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1862 _mm_store_ps(screen4f, v);
1869 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads the vertex attribute selected by `inarray` for the current draw range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into dpsoftrast.post_array4f[outarray],
// expanding every element to four floats.
//   - position: three floats per vertex via DPSOFTRAST_Load3fTo4f
//   - colour:   float array, else byte array, else the constant dpsoftrast.color
//   - texcoord: unit inarray - DPSOFTRAST_ARRAY_TEXCOORD0, loaded according to that unit's
//               component count
// The return statement is not visible in this excerpt; callers use the result as the
// loaded data pointer.
1874 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1877 float *outf = dpsoftrast.post_array4f[outarray];
1878 const unsigned char *inb;
1879 int firstvertex = dpsoftrast.firstvertex;
1880 int numvertices = dpsoftrast.numvertices;
1884 case DPSOFTRAST_ARRAY_POSITION:
1885 stride = dpsoftrast.stride_vertex;
1886 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1887 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1889 case DPSOFTRAST_ARRAY_COLOR:
1890 stride = dpsoftrast.stride_color;
1891 if (dpsoftrast.pointer_color4f)
1893 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1894 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1896 else if (dpsoftrast.pointer_color4ub)
// stride was already set from stride_color above; this assignment repeats it
1898 stride = dpsoftrast.stride_color;
1899 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1900 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no colour array set: replicate the constant colour for every vertex
1904 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1908 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1909 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1911 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1912 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1915 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1918 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1921 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an attribute array in place by inmatrix16f. When inarray >= 0 the data is first
// loaded with DPSOFTRAST_Array_Load; otherwise the existing contents of
// dpsoftrast.post_array4f[outarray] are used.
1933 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1935 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an attribute array to screen space without a matrix. When inarray >= 0 the data is
// first loaded with DPSOFTRAST_Array_Load; otherwise post_array4f[outarray] is used as is.
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to drawstarty/drawendy,
// and the result of DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
1941 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1944 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1945 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an attribute array in place by inmatrix16f and projects it to screen space.
// When inarray >= 0 the data is first loaded with DPSOFTRAST_Array_Load; otherwise
// post_array4f[outarray] is used as is. Screen coordinates go to dpsoftrast.screencoord4f,
// the row range to drawstarty/drawendy, and the result of
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
1953 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1956 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1957 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel interpolation values for a span, walking it from span->startx to
// span->endx in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
// w is evaluated from the triangle's plane (w[0] = slope along x, w[1] = slope along y,
// w[2] = constant term) at the span origin; 1/w is computed exactly at sub-span
// boundaries and stepped linearly (dz) in between.
// NOTE(review): the store into zf is not visible in this excerpt - presumably each pixel's
// interpolated value is written to zf[x]; confirm.
1964 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1967 int startx = span->startx;
1968 int endx = span->endx;
1969 float wslope = triangle->w[0];
1970 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact 1/w at the first pixel
1971 float endz = 1.0f / (w + wslope * startx);
1972 for (x = startx;x < endx;)
1974 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span ends on the final pixel of the span
1976 if (nextsub >= endx) nextsub = endsub = endx-1;
1977 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel to avoid dividing by zero
1978 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1979 for (; x <= endsub; x++, z += dz)
1984 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1987 int startx = span->startx;
1988 int endx = span->endx;
1991 unsigned char * RESTRICT pixelmask = span->pixelmask;
1992 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1995 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1996 // handle alphatest now (this affects depth writes too)
1997 if (thread->alphatest)
1998 for (x = startx;x < endx;x++)
1999 if (in4f[x*4+3] < 0.5f)
2000 pixelmask[x] = false;
2001 // FIXME: this does not handle bigendian
2002 switch(thread->fb_blendmode)
2004 case DPSOFTRAST_BLENDMODE_OPAQUE:
2005 for (x = startx;x < endx;x++)
2009 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2010 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2011 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2012 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2013 pixel[x*4+0] = d[0];
2014 pixel[x*4+1] = d[1];
2015 pixel[x*4+2] = d[2];
2016 pixel[x*4+3] = d[3];
2019 case DPSOFTRAST_BLENDMODE_ALPHA:
2020 for (x = startx;x < endx;x++)
2024 a = in4f[x*4+3] * 255.0f;
2025 b = 1.0f - in4f[x*4+3];
2026 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2027 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2028 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2029 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2030 pixel[x*4+0] = d[0];
2031 pixel[x*4+1] = d[1];
2032 pixel[x*4+2] = d[2];
2033 pixel[x*4+3] = d[3];
2036 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2037 for (x = startx;x < endx;x++)
2041 a = in4f[x*4+3] * 255.0f;
2042 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2043 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2044 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2045 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2046 pixel[x*4+0] = d[0];
2047 pixel[x*4+1] = d[1];
2048 pixel[x*4+2] = d[2];
2049 pixel[x*4+3] = d[3];
2052 case DPSOFTRAST_BLENDMODE_ADD:
2053 for (x = startx;x < endx;x++)
2057 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2058 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2059 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2060 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2061 pixel[x*4+0] = d[0];
2062 pixel[x*4+1] = d[1];
2063 pixel[x*4+2] = d[2];
2064 pixel[x*4+3] = d[3];
2067 case DPSOFTRAST_BLENDMODE_INVMOD:
2068 for (x = startx;x < endx;x++)
2072 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2073 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2074 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2075 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2076 pixel[x*4+0] = d[0];
2077 pixel[x*4+1] = d[1];
2078 pixel[x*4+2] = d[2];
2079 pixel[x*4+3] = d[3];
2082 case DPSOFTRAST_BLENDMODE_MUL:
2083 for (x = startx;x < endx;x++)
2087 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2088 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2089 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2090 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2091 pixel[x*4+0] = d[0];
2092 pixel[x*4+1] = d[1];
2093 pixel[x*4+2] = d[2];
2094 pixel[x*4+3] = d[3];
2097 case DPSOFTRAST_BLENDMODE_MUL2:
2098 for (x = startx;x < endx;x++)
2102 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2103 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2104 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2105 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2106 pixel[x*4+0] = d[0];
2107 pixel[x*4+1] = d[1];
2108 pixel[x*4+2] = d[2];
2109 pixel[x*4+3] = d[3];
2112 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2113 for (x = startx;x < endx;x++)
2117 a = in4f[x*4+3] * -255.0f;
2118 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2119 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2120 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2121 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2122 pixel[x*4+0] = d[0];
2123 pixel[x*4+1] = d[1];
2124 pixel[x*4+2] = d[2];
2125 pixel[x*4+3] = d[3];
2128 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2129 for (x = startx;x < endx;x++)
2134 b = 1.0f - in4f[x*4+3];
2135 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2136 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2137 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2138 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2139 pixel[x*4+0] = d[0];
2140 pixel[x*4+1] = d[1];
2141 pixel[x*4+2] = d[2];
2142 pixel[x*4+3] = d[3];
2145 case DPSOFTRAST_BLENDMODE_INVADD:
2146 for (x = startx;x < endx;x++)
2150 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2151 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2152 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2153 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2154 pixel[x*4+0] = d[0];
2155 pixel[x*4+1] = d[1];
2156 pixel[x*4+2] = d[2];
2157 pixel[x*4+3] = d[3];
// Writes one span of BGRA8 source pixels (in4ub, four bytes per pixel, indexed by the
// span-relative x) into color buffer 0, using SSE2 intrinsics for the copy/blend math.
//   thread   - provides the alphatest flag and the framebuffer blend mode
//   triangle - not referenced by any line visible in this excerpt
//   span     - provides startx/endx, the framebuffer position (x, y) and the per-pixel mask
//   in4ub    - source pixels; byte 3 of each pixel is treated as alpha
// The alpha test, when enabled, clears pixelmask[x] for rejected pixels.
2163 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2167 int startx = span->startx;
2168 int endx = span->endx;
// ini/pixeli view the source and the framebuffer as one 32-bit word per pixel
2169 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2170 unsigned char * RESTRICT pixelmask = span->pixelmask;
2171 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2172 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to pixel (span->x, span->y); rows are fb_width pixels apart
2175 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2176 pixeli += span->y * dpsoftrast.fb_width + span->x;
2177 // handle alphatest now (this affects depth writes too)
2178 if (thread->alphatest)
2179 for (x = startx;x < endx;x++)
// NOTE(review): in4ub holds bytes, so "< 0.5f" is true only for alpha == 0; if a
// half-opacity cutoff on 0..255 alpha is intended this would have to be "< 128" - confirm
2180 if (in4ub[x*4+3] < 0.5f)
2181 pixelmask[x] = false;
2182 // FIXME: this does not handle bigendian
2183 switch(thread->fb_blendmode)
2185 case DPSOFTRAST_BLENDMODE_OPAQUE:
// four pixels per step: when the four mask bytes read as 0x01010101 (all set) the
// 16 source bytes are copied with one unaligned 128-bit load and store
2186 for (x = startx;x + 4 <= endx;)
2188 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2190 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2204 case DPSOFTRAST_BLENDMODE_ALPHA:
// FINISHBLEND(blend2, blend1) is the loop skeleton shared by all blended modes below.
// It walks the span two pixels at a time and switches on the two mask bytes read as one
// 16-bit value; src and dst bytes are widened to 16-bit lanes (unpack with zero), and the
// result is narrowed back with _mm_packus_epi16 (unsigned saturation) before the store.
// A trailing loop handles the remaining pixels one at a time.
// Judging by the invocations, blend2 operates on two pixels (both 64-bit halves) and blend1
// on a single pixel (low half only); the lines where they are expanded are not visible here.
// In the alpha-based modes "blend" is word 3 (alpha) of each widened source pixel
// broadcast to all four channels via _MM_SHUFFLE(3, 3, 3, 3).
// ALPHA: dst += ((src - dst) << 4) * (alpha << 4) >> 16, i.e. dst + (src - dst) * alpha / 256
2205 #define FINISHBLEND(blend2, blend1) \
2206 for (x = startx;x + 1 < endx;x += 2) \
2209 switch (*(const unsigned short*)&pixelmask[x]) \
2212 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2215 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2218 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2219 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2221 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2225 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2227 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2232 for(;x < endx; x++) \
2235 if (!pixelmask[x]) \
2237 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2238 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2240 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2247 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2251 case DPSOFTRAST_BLENDMODE_ADDALPHA:
// dst += (src * alpha) >> 8
2253 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2254 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2256 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2257 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2260 case DPSOFTRAST_BLENDMODE_ADD:
// dst = src + dst; clamping to 255 comes from the saturating pack in FINISHBLEND
2261 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2263 case DPSOFTRAST_BLENDMODE_INVMOD:
// dst -= (dst * src) >> 8
2265 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2267 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2270 case DPSOFTRAST_BLENDMODE_MUL:
// dst = (src * dst) >> 8
2271 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2273 case DPSOFTRAST_BLENDMODE_MUL2:
// dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2274 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2276 case DPSOFTRAST_BLENDMODE_SUBALPHA:
// dst -= (src * alpha) >> 8
2278 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2281 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2285 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
// dst = src + dst - ((dst * alpha) >> 8)
2287 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2288 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2290 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2291 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2294 case DPSOFTRAST_BLENDMODE_INVADD:
// dst += ((255 - dst) << 4) * (src << 4) >> 16, i.e. dst + (255 - dst) * src / 256
2296 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2298 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex for every pixel of the span
// and writes four floats per pixel to out4f. Texture bytes 2, 1, 0, 3 go to output
// channels 0, 1, 2, 3 and are scaled to 0..1.
//   thread       - provides the bound textures (texbound[])
//   triangle     - provides the mip level per texture unit and the attribute setup
//   span         - provides startx/endx
//   arrayindex   - selects the attribute passed to DPSOFTRAST_CALCATTRIB4F (used as texcoord)
//   zf           - per-pixel factor multiplied into the interpolated coordinate
//                  (presumably the perspective-correction term - confirm)
2305 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2308 int startx = span->startx;
2309 int endx = span->endx;
2314 float tc[2], endtc[2];
// integer texel coordinates: tci is the sampled texel, tci1 its +1 neighbour
2316 unsigned int tci[2];
2317 unsigned int tci1[2];
2318 unsigned int tcimin[2];
2319 unsigned int tcimax[2];
2324 const unsigned char * RESTRICT pixelbase;
// the (up to) four texels combined by the bilinear filter
2325 const unsigned char * RESTRICT pixel[4];
2326 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2327 // if no texture is bound, just fill it with white
2330 for (x = startx;x < endx;x++)
2332 out4f[x*4+0] = 1.0f;
2333 out4f[x*4+1] = 1.0f;
2334 out4f[x*4+2] = 1.0f;
2335 out4f[x*4+3] = 1.0f;
2339 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of the selected level inside texture->bytes
2340 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2341 // if this mipmap of the texture is 1 pixel, just fill it with that color
2342 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the color is read from the start of texture->bytes, not from pixelbase, so
// it is the selected level's texel only when that level's offset is 0; the BGRA8 variant of
// this function reads through pixelbase - confirm which is intended
2344 c[0] = texture->bytes[2] * (1.0f/255.0f);
2345 c[1] = texture->bytes[1] * (1.0f/255.0f);
2346 c[2] = texture->bytes[0] * (1.0f/255.0f);
2347 c[3] = texture->bytes[3] * (1.0f/255.0f);
2348 for (x = startx;x < endx;x++)
2350 out4f[x*4+0] = c[0];
2351 out4f[x*4+1] = c[1];
2352 out4f[x*4+2] = c[2];
2353 out4f[x*4+3] = c[3];
2357 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// presumably fills data[] (value at x = 0) and slope[] (change per pixel) - confirm against the macro
2358 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2359 flags = texture->flags;
// mipmap[mip][2] and [3] act as the level's width and height: they scale the coordinates,
// give the row pitch in texels (tciwidth) and the clamp/wrap limits (size - 1).
// The "& (size - 1)" wrap is a true modulo only for power-of-two sizes.
2360 tcscale[0] = texture->mipmap[mip][2];
2361 tcscale[1] = texture->mipmap[mip][3];
2362 tciwidth = texture->mipmap[mip][2];
2365 tcimax[0] = texture->mipmap[mip][2]-1;
2366 tcimax[1] = texture->mipmap[mip][3]-1;
2367 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2368 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the 0.5 subtraction shifts by half a texel
2369 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2370 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// coordinate is evaluated with zf at the sub-span end and stepped linearly in 16.16 fixed
// point (subtc/substep) for the pixels in between
2371 for (x = startx;x < endx;)
2373 unsigned int subtc[2];
2374 unsigned int substep[2];
2375 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2376 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last (possibly shorter) sub-span: end on the final pixel and rescale the step to its length
2377 if (nextsub >= endx)
2379 nextsub = endsub = endx-1;
2380 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2384 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2385 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): subtc/substep are unsigned while tc and the step can be negative (e.g. the
// -0.5 bias); converting a float <= -1 to unsigned int is undefined in C - confirm this is
// acceptable on the supported compilers
2386 substep[0] = (endtc[0] - tc[0]) * subscale;
2387 substep[1] = (endtc[1] - tc[1]) * subscale;
2388 subtc[0] = tc[0] * (1<<16);
2389 subtc[1] = tc[1] * (1<<16);
// bilinear filtering, clamp-to-edge addressing
2392 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2394 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// NOTE(review): subtc is 16.16 (the integer part is taken with >>16 below), so "&0xFFF"
// keeps the lowest 12 of the 16 fraction bits rather than the highest 12, which would be
// (subtc>>4)&0xFFF - confirm
2396 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2397 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// four bilinear weights; they always sum to 0x1000*0x1000 = 2^24, which is why the weighted
// byte sum is normalised by 1/0xFF000000 (= 1/(255 * 2^24)) below
2398 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2399 tci[0] = subtc[0]>>16;
2400 tci[1] = subtc[1]>>16;
2401 tci1[0] = tci[0] + 1;
2402 tci1[1] = tci[1] + 1;
// clamp both taps into [tcimin, tcimax]
// NOTE(review): the coordinates are unsigned, so a value that went negative appears as a
// large number and is clamped to tcimax rather than tcimin - confirm
2403 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2404 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2405 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2406 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the 2x2 texel neighbourhood (4 bytes per texel, tciwidth texels per row)
2407 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2408 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2409 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2410 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2411 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2412 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2413 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2414 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2415 out4f[x*4+0] = c[0];
2416 out4f[x*4+1] = c[1];
2417 out4f[x*4+2] = c[2];
2418 out4f[x*4+3] = c[3];
// bilinear filtering, wrapping addressing: same as the loop above except that the taps are
// masked with tciwrapmask instead of clamped (the same fraction-bit note applies)
2423 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2425 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2426 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2427 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2428 tci[0] = subtc[0]>>16;
2429 tci[1] = subtc[1]>>16;
2430 tci1[0] = tci[0] + 1;
2431 tci1[1] = tci[1] + 1;
2432 tci[0] &= tciwrapmask[0];
2433 tci[1] &= tciwrapmask[1];
2434 tci1[0] &= tciwrapmask[0];
2435 tci1[1] &= tciwrapmask[1];
2436 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2437 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2438 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2439 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2440 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2441 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2442 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2443 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2444 out4f[x*4+0] = c[0];
2445 out4f[x*4+1] = c[1];
2446 out4f[x*4+2] = c[2];
2447 out4f[x*4+3] = c[3];
// single-texel sampling (no interpolation), clamp-to-edge addressing
2451 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2453 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455 tci[0] = subtc[0]>>16;
2456 tci[1] = subtc[1]>>16;
2457 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2458 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2459 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
// single-texel sampling, wrapping addressing
2472 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2474 tci[0] = subtc[0]>>16;
2475 tci[1] = subtc[1]>>16;
2476 tci[0] &= tciwrapmask[0];
2477 tci[1] &= tciwrapmask[1];
2478 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2479 c[0] = pixel[0][2] * (1.0f / 255.0f);
2480 c[1] = pixel[0][1] * (1.0f / 255.0f);
2481 c[2] = pixel[0][0] * (1.0f / 255.0f);
2482 c[3] = pixel[0][3] * (1.0f / 255.0f);
2483 out4f[x*4+0] = c[0];
2484 out4f[x*4+1] = c[1];
2485 out4f[x*4+2] = c[2];
2486 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture sampler: samples the texture bound to texture unit
// texunitindex for every pixel of the span and writes 4 bytes per pixel to out4ub, copied or
// interpolated straight from the texel bytes (no channel reordering).
//   thread       - provides the bound textures (texbound[])
//   triangle     - provides the mip level per texture unit and the attribute setup
//   span         - provides startx/endx
//   arrayindex   - selects the attribute passed to DPSOFTRAST_CALCATTRIB (used as texcoord)
//   zf           - per-pixel factor multiplied into the interpolated coordinate
//                  (presumably the perspective-correction term - confirm)
// Two pixels are processed per iteration; their 16.16 fixed-point coordinates share one
// register (pixel x in the low 64 bits, pixel x+1 in the high 64 bits).
2492 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2496 int startx = span->startx;
2497 int endx = span->endx;
2499 __m128 data, slope, tcscale;
2500 __m128i tcsize, tcmask, tcoffset, tcmax;
2502 __m128i subtc, substep, endsubtc;
// output viewed as one 32-bit word per pixel
2505 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2506 const unsigned char * RESTRICT pixelbase;
2507 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2508 // if no texture is bound, just fill it with white
2511 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2514 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as the byte offset of the selected level inside texture->bytes
2515 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2516 // if this mipmap of the texture is 1 pixel, just fill it with that color
2517 if (texture->mipmap[mip][1] == 4)
// k is the level's only texel, read as one 32-bit word
2519 unsigned int k = *((const unsigned int *)pixelbase);
2520 for (x = startx;x < endx;x++)
2524 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// presumably fills data (value at x = 0) and slope (change per pixel) - confirm against the macro
2525 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2526 flags = texture->flags;
// tcsize = (m[2], m[3], m[2], m[3]) of mipmap[mip] (used as width, height); the 128-bit load
// assumes the four mipmap entries are contiguous 32-bit integers - confirm
2527 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2528 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2529 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the first two attribute components into both halves and scale them to texel units
2530 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2531 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel (shifted by half a texel) and its 16.16 form
2532 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2533 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// every 32-bit lane of tcoffset holds 4 in its low word and width*4 in its high word, so
// _mm_madd_epi16 of an (x, y) word pair with it yields the byte offset x*4 + y*width*4
2534 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size - 1 as signed 16-bit words; used both as clamp maximum and as wrap mask
2535 tcmax = _mm_packs_epi32(tcmask, tcmask);
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels; the
// coordinate is evaluated with zf at the sub-span end and stepped linearly in between
2536 for (x = startx;x < endx;)
2538 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2539 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last (possibly shorter) sub-span: end on the final pixel and rescale the step to its length
2540 if (nextsub >= endx)
2542 nextsub = endsub = endx-1;
2543 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2547 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2548 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2549 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// pack two consecutive pixels into subtc (x in the low half, x+1 in the high half) and
// double the step because each iteration advances two pixels
2550 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2551 substep = _mm_slli_epi32(substep, 1);
// integer coordinates of the first pixel and of (sub-span end + step); if all of them are
// >= 0 and < size - 1, every 2x2 neighbourhood in this sub-span lies inside the level and
// the loop below needs neither clamping nor wrapping
2554 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2555 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row pitch in bytes (high word of tcoffset)
2557 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2558 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2560 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// gather the integer (high) words of each coordinate, then turn them into byte offsets:
// lane 0 addresses pixel x, lane 2 addresses pixel x+1
2561 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2562 tci = _mm_madd_epi16(tci, tcoffset);
2563 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2564 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels, widened to 16-bit lanes;
// pix1/pix3 are the upper rows and pix2/pix4 the rows one stride below
2565 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2566 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2567 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2568 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// halve every 16-bit word so the fraction words become non-negative 15-bit factors; the
// (difference << 1) * fraction >> 16 products below are then difference * fraction
2569 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows of pixel x by its second-coordinate fraction (word 2 of fracm)
2570 pix1 = _mm_add_epi16(pix1,
2571 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2572 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
// same for pixel x+1 (word 6 of fracm)
2573 pix3 = _mm_add_epi16(pix3,
2574 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2575 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup as (left texel of x, left texel of x+1) and (right texel of x, right texel of x+1),
// then blend left/right by each pixel's first-coordinate fraction (words 0 and 4)
2576 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2577 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2578 pix2 = _mm_add_epi16(pix2,
2579 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2580 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
// narrow to bytes with unsigned saturation and store both pixels (8 bytes)
2581 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the loop body above, using only the low half of subtc
// (presumably for a leftover odd pixel; the guarding condition is not visible here)
2585 const unsigned char * RESTRICT ptr1;
2586 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2587 tci = _mm_madd_epi16(tci, tcoffset);
2588 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2589 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2590 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2591 fracm = _mm_srli_epi16(subtc, 1);
2592 pix1 = _mm_add_epi16(pix1,
2593 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2594 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2595 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2596 pix1 = _mm_add_epi16(pix1,
2597 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2598 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2599 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear filtering with clamp-to-edge addressing: the four taps of a pixel are addressed
// individually so that each can be clamped
2603 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2605 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
// replicate pixel x's integer (x, y) into all four lanes, add the tap offsets
// (0,0) (1,0) (0,1) (1,1), clamp every word to [0, size - 1], convert to byte offsets
2607 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2608 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2609 tci = _mm_madd_epi16(tci, tcoffset);
// pix1 = taps 0 and 1 (upper row), pix2 = taps 2 and 3 (lower row), widened to 16-bit lanes
2610 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612 _mm_setzero_si128());
2613 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615 _mm_setzero_si128());
// same for pixel x+1 (high half of subtc)
2616 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this second pixel is wrapped with _mm_and_si128 although this is the
// clamp-to-edge branch; the first pixel above and the single-pixel code below both clamp
// with _mm_min_epi16/_mm_max_epi16 - looks like a copy of the wrapping branch, confirm
2617 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2618 tci = _mm_madd_epi16(tci, tcoffset);
2619 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2620 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2621 _mm_setzero_si128());
2622 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2623 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2624 _mm_setzero_si128());
// two-stage interpolation and store, identical to the unclamped loop above
2625 fracm = _mm_srli_epi16(subtc, 1);
2626 pix1 = _mm_add_epi16(pix1,
2627 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2628 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2629 pix3 = _mm_add_epi16(pix3,
2630 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2631 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2632 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2633 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2634 pix2 = _mm_add_epi16(pix2,
2635 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2636 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2637 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version with clamped taps
2641 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2642 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2643 tci = _mm_madd_epi16(tci, tcoffset);
2644 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2645 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2646 _mm_setzero_si128());
2647 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2648 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2649 _mm_setzero_si128());
2650 fracm = _mm_srli_epi16(subtc, 1);
2651 pix1 = _mm_add_epi16(pix1,
2652 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2653 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2654 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2655 pix1 = _mm_add_epi16(pix1,
2656 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2658 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear filtering with wrapping addressing: as the clamp branch, but every tap is
// masked with size - 1 (a true modulo only for power-of-two sizes)
2664 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2666 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2667 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668 tci = _mm_madd_epi16(tci, tcoffset);
2669 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671 _mm_setzero_si128());
2672 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674 _mm_setzero_si128());
2675 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2676 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2677 tci = _mm_madd_epi16(tci, tcoffset);
2678 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680 _mm_setzero_si128());
2681 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683 _mm_setzero_si128());
2684 fracm = _mm_srli_epi16(subtc, 1);
2685 pix1 = _mm_add_epi16(pix1,
2686 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2687 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2688 pix3 = _mm_add_epi16(pix3,
2689 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2690 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2691 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2692 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2693 pix2 = _mm_add_epi16(pix2,
2694 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2695 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2696 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version with wrapped taps
2700 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2701 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2702 tci = _mm_madd_epi16(tci, tcoffset);
2703 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2704 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2705 _mm_setzero_si128());
2706 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2707 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2708 _mm_setzero_si128());
2709 fracm = _mm_srli_epi16(subtc, 1);
2710 pix1 = _mm_add_epi16(pix1,
2711 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2712 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2713 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2714 pix1 = _mm_add_epi16(pix1,
2715 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2717 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling (no interpolation), clamp-to-edge addressing: one 32-bit texel is
// copied per pixel
2724 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2726 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2730 tci = _mm_madd_epi16(tci, tcoffset);
2731 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2736 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2738 tci = _mm_madd_epi16(tci, tcoffset);
2739 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling, wrapping addressing
2745 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2747 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2748 tci = _mm_and_si128(tci, tcmax);
2749 tci = _mm_madd_epi16(tci, tcoffset);
2750 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2751 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel version
2755 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2756 tci = _mm_and_si128(tci, tcmax);
2757 tci = _mm_madd_epi16(tci, tcoffset);
2758 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2767 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2770 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Takes a pointer to read-only floats and returns a single float.
// NOTE(review): the body is not visible in this excerpt; judging by the name this is meant
// to return a shadow-map lookup result for the given vector - confirm what it returns today
2773 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies the float color buffer in4f by an interpolated vertex attribute and stores the
// product in out4f, for every pixel x in [span->startx, span->endx) (four floats per pixel).
//   arrayindex - selects the attribute passed to DPSOFTRAST_CALCATTRIB4F
//   zf         - NOTE(review): not referenced by the visible lines; z below is presumably
//                loaded from zf[x] on a line missing from this excerpt - confirm
2779 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2782 int startx = span->startx;
2783 int endx = span->endx;
// presumably fills data[] (value at x = 0) and slope[] (change per pixel) - confirm against the macro
2788 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2789 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2792 c[0] = (data[0] + slope[0]*x) * z;
2793 c[1] = (data[1] + slope[1]*x) * z;
2794 c[2] = (data[2] + slope[2]*x) * z;
2795 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input channel by channel
2796 out4f[x*4+0] = in4f[x*4+0] * c[0];
2797 out4f[x*4+1] = in4f[x*4+1] * c[1];
2798 out4f[x*4+2] = in4f[x*4+2] * c[2];
2799 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute to out4f for every pixel x in
// [span->startx, span->endx) (four floats per pixel).
//   arrayindex - selects the attribute passed to DPSOFTRAST_CALCATTRIB4F
//   zf         - NOTE(review): not referenced by the visible lines; z below is presumably
//                loaded from zf[x] on a line missing from this excerpt - confirm
2803 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2806 int startx = span->startx;
2807 int endx = span->endx;
// presumably fills data[] (value at x = 0) and slope[] (change per pixel) - confirm against the macro
2812 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2813 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2816 c[0] = (data[0] + slope[0]*x) * z;
2817 c[1] = (data[1] + slope[1]*x) * z;
2818 c[2] = (data[2] + slope[2]*x) * z;
2819 c[3] = (data[3] + slope[3]*x) * z;
2820 out4f[x*4+0] = c[0];
2821 out4f[x*4+1] = c[1];
2822 out4f[x*4+2] = c[2];
2823 out4f[x*4+3] = c[3];
2827 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2829 int x, startx = span->startx, endx = span->endx;
2830 float c[4], localcolor[4];
2831 localcolor[0] = subcolor[0];
2832 localcolor[1] = subcolor[1];
2833 localcolor[2] = subcolor[2];
2834 localcolor[3] = subcolor[3];
2835 for (x = startx;x < endx;x++)
2837 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2838 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2839 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2840 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2841 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2842 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2843 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2844 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2848 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2850 int x, startx = span->startx, endx = span->endx;
2851 for (x = startx;x < endx;x++)
2853 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2854 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2855 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2856 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2860 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2862 int x, startx = span->startx, endx = span->endx;
2863 for (x = startx;x < endx;x++)
2865 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2866 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2867 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2868 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans (four floats per pixel): out4f = ina4f * a + inb4f * b,
// where a is one minus the fourth channel of the inb4f pixel.
// NOTE(review): the declaration of a/b and the assignment of b are not visible in this
// excerpt; b is presumably the fourth channel of the inb4f pixel (so a + b == 1) - confirm.
2872 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2874 int x, startx = span->startx, endx = span->endx;
2876 for (x = startx;x < endx;x++)
// weight applied to ina4f
2878 a = 1.0f - inb4f[x*4+3];
2880 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2881 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2882 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2883 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2887 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2889 int x, startx = span->startx, endx = span->endx;
2890 float localcolor[4], ilerp, lerp;
2891 localcolor[0] = color[0];
2892 localcolor[1] = color[1];
2893 localcolor[2] = color[2];
2894 localcolor[3] = color[3];
2895 ilerp = 1.0f - localcolor[3];
2896 lerp = localcolor[3];
2897 for (x = startx;x < endx;x++)
2899 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2900 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2901 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2902 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies a span of 4-byte pixels by interpolated vertex attribute `arrayindex`,
// using 16-bit fixed-point modulators scaled by 256 (1.0 == 256): out4ub = in4ub * attribute.
// The attribute is evaluated in floating point at sub-span end points (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped in integers in between.
// NOTE(review): several lines are not visible in this excerpt - the declarations of
// x/data/slope/mod/endmod, the statements that load mod and submod at the start of each
// sub-span, the guard around the single-pixel tail and the loop-closing code.
2908 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2912 int startx = span->startx;
2913 int endx = span->endx;
2916 __m128i submod, substep, endsubmod;
2917 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA attribute order -> BGRA pixel order - confirm)
2918 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2919 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point
2920 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2921 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// x is advanced inside the body, not in the for header
2922 for (x = startx; x < endx;)
2924 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2925 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
2926 if (nextsub >= endx)
2928 nextsub = endsub = endx-1;
2929 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2933 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point change per pixel across the sub-span
2934 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2935 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low half: modulator for pixel x, high half: modulator for pixel x+1
2936 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2937 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): substep is a one-pixel step but is added once per two-pixel iteration -
// confirm whether it should be doubled here.
2938 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: each input byte goes into the high byte of a 16-bit lane (value*256), so the
// high 16 bits of the product are value*modulator/256; packus saturates back to bytes
2940 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2941 pix = _mm_mulhi_epu16(pix, submod);
2942 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for a single pixel
2946 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2947 pix = _mm_mulhi_epu16(pix, submod);
2948 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes interpolated vertex attribute `arrayindex` as 4-byte pixels into out4ub.
// Values are carried as 16-bit fixed point scaled by 4095 and shifted right by 4 before
// being packed to bytes (so 1.0 ends up as 255).
// The attribute is evaluated in floating point at sub-span end points (at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart) and stepped in integers in between.
// NOTE(review): several lines are not visible in this excerpt - the declarations of
// x/data/slope/mod/endmod, the statements that load mod and submod at the start of each
// sub-span, the guard around the single-pixel tail and the loop-closing code.
2955 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2959 int startx = span->startx;
2960 int endx = span->endx;
2963 __m128i submod, substep, endsubmod;
2964 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA attribute order -> BGRA pixel order - confirm)
2965 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2966 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point
2967 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2968 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// x is advanced inside the body, not in the for header
2969 for (x = startx; x < endx;)
2971 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2972 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last sub-span: clamp to the final pixel and rescale the step to the shorter length
2973 if (nextsub >= endx)
2975 nextsub = endsub = endx-1;
2976 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
2980 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// fixed-point change per pixel across the sub-span
2981 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2982 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low half: value for pixel x, high half: value for pixel x+1
2983 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2984 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): substep is a one-pixel step but is added once per two-pixel iteration -
// confirm whether it should be doubled here.
2985 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels: drop the 4 fractional bits, then saturate to bytes
2987 __m128i pix = _mm_srai_epi16(submod, 4);
2988 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same conversion for a single pixel
2992 __m128i pix = _mm_srai_epi16(submod, 4);
2993 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Byte-pixel bloom combine: out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0)).
// The main loop handles two pixels per iteration; the last four statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3000 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3003 int x, startx = span->startx, endx = span->endx;
// subcolor scaled to 0..255 with lanes 0 and 2 swapped (_MM_SHUFFLE(3, 0, 1, 2)),
// then packed to 16 bits so the same four values sit in both halves of the register
3004 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3005 localcolor = _mm_packs_epi32(localcolor, localcolor);
3006 for (x = startx;x+2 <= endx;x+=2)
// widen both inputs to 16 bits per channel
3008 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3009 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the saturating subtract clamps at zero; packus clamps the sum at 255
3010 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3011 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel version of the same computation
3015 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3016 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3017 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3018 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Byte-pixel product of two spans: out4ub = ina4ub * inb4ub / 256 per channel.
// The main loop handles two pixels per iteration; the last four statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3023 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3026 int x, startx = span->startx, endx = span->endx;
3027 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes zero-extended; pix2 holds the bytes in the HIGH byte of each
// 16-bit lane (value*256), so the high half of the 16x16 product is a*b/256
3029 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3030 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3031 pix1 = _mm_mulhi_epu16(pix1, pix2);
3032 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel version of the same computation
3036 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3037 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3038 pix1 = _mm_mulhi_epu16(pix1, pix2);
3039 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Byte-pixel saturating sum of two spans: out4ub = min(ina4ub + inb4ub, 255) per channel.
// The main loop handles two pixels per iteration; the last four statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3044 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3047 int x, startx = span->startx, endx = span->endx;
3048 for (x = startx;x+2 <= endx;x+=2)
// widen to 16 bits so the sum cannot wrap; packus clamps it back to 0..255
3050 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3051 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3052 pix1 = _mm_add_epi16(pix1, pix2);
3053 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel version of the same computation
3057 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3058 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3059 pix1 = _mm_add_epi16(pix1, pix2);
3060 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Byte-pixel tinted add: out4ub = saturate(ina4ub + inb4ub * inbtintbgra) per channel.
// The tint is used in the channel order given (no lane swap is applied here).
// The main loop handles two pixels per iteration; the last four statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3065 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3068 int x, startx = span->startx, endx = span->endx;
// tint as fixed point scaled by 256, packed to 16 bits and duplicated in both halves
3069 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3070 tint = _mm_packs_epi32(tint, tint);
3071 for (x = startx;x+2 <= endx;x+=2)
// pix2 has each byte in the high byte of its lane (value*256), so the high half of
// tint*pix2 is tint*value
3073 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3074 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3075 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3076 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel version of the same computation
3080 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3081 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3082 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3083 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Byte-pixel blend of two spans, weighted by the fourth channel of inb4ub:
//   out4ub = ina4ub + (inb4ub - ina4ub) * inb_alpha / 256
// The main loop handles two pixels per iteration; the last five statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3088 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3091 int x, startx = span->startx, endx = span->endx;
3092 for (x = startx;x+2 <= endx;x+=2)
3094 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3095 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth 16-bit channel across that pixel's four lanes
3096 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4, so the signed high-half product is diff*alpha/256
3097 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3098 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel version: only the low four lanes carry data
3102 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3105 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3106 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Byte-pixel blend toward one constant color, weighted by that color's fourth channel:
//   out4ub = in4ub + (color*255 - in4ub) * (color[3]*255) / 256
// The main loop handles two pixels per iteration; the last three statements handle one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not visible in this
// excerpt; presumably it runs only when one pixel is left over - confirm.
3111 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3114 int x, startx = span->startx, endx = span->endx;
// color scaled to 0..255 with lanes 0 and 2 swapped (_MM_SHUFFLE(3, 0, 1, 2)),
// then packed to 16 bits so the same four values sit in both halves of the register
3115 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3116 localcolor = _mm_packs_epi32(localcolor, localcolor);
// fourth channel broadcast to every lane and pre-shifted left by 4 for the multiply below
3117 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3118 for (x = startx;x+2 <= endx;x+=2)
3120 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed high-half product is diff*alpha/256
3121 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3122 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel version of the same computation
3126 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3127 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3128 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3135 void DPSOFTRAST_VertexShader_Generic(void)
3137 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3138 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3139 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3140 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3141 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3144 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3146 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3147 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3148 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3149 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3150 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3151 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3153 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3154 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3155 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3157 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3158 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3161 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3163 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3166 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3168 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3171 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3176 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3177 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3182 void DPSOFTRAST_VertexShader_PostProcess(void)
3184 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3185 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3186 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3189 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3191 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3192 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3193 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3194 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3195 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3196 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3197 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3199 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3200 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3202 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3203 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3205 // TODO: implement saturation
3207 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3209 // TODO: implement gammaramps
3211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3216 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3218 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3221 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 // this is never called (because colormask is off when this shader is used)
3224 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3225 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3227 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3228 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3233 void DPSOFTRAST_VertexShader_FlatColor(void)
3235 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3236 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: output = color texture * Color_Ambient,
// with the fourth channel scaled by the Alpha uniform instead of Color_Ambient's own.
// Pixels are written directly to the framebuffer row unless alpha test or a non-opaque
// blend mode is active, in which case they go to a local buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some lines are not visible in this excerpt - the declarations of
// color/pix/pix2, the statements that advance x past a 4-pixel group, and whatever skips
// pixels whose mask byte is clear in the single-pixel path.
3239 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3242 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3243 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3244 int x, startx = span->startx, endx = span->endx;
3245 __m128i Color_Ambientm;
3246 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3247 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3248 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3249 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3250 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer instead
3251 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3252 pixel = buffer_FragColorbgra8;
// ambient color as fixed point scaled by 256, lanes 0 and 2 swapped
3253 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace lane 3 with the Alpha uniform scaled by 255
3254 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3255 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// 16-bit lanes, same four values in both halves (one copy per pixel of a pair)
3256 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3257 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes equal 1
3260 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3263 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each lane, so mulhi yields ambient*texel
3264 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3265 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3266 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3272 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3273 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3274 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still has to be written out
3276 if (pixel == buffer_FragColorbgra8)
3277 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3283 void DPSOFTRAST_VertexShader_VertexColor(void)
3285 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3286 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3287 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span:
//   output = color texture * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// with the fourth channel of the lighting term taken from the Alpha uniform (the diffuse
// term's fourth lane is zeroed). Pixels are written directly to the framebuffer row unless
// alpha test or a non-opaque blend mode is active, in which case they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some lines are not visible in this excerpt - the declarations of
// data/slope/mod2/pix2, the statements that advance x past a 4-pixel group, and whatever
// skips pixels whose mask byte is clear in the single-pixel path.
3290 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3293 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3294 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3295 int x, startx = span->startx, endx = span->endx;
3296 __m128i Color_Ambientm, Color_Diffusem;
3298 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3299 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3302 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3303 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer instead
3304 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3305 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256, lanes 0 and 2 swapped, lane 3 replaced by Alpha*255
3306 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3309 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096, lanes 0 and 2 swapped, lane 3 cleared
3310 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color interpolant: swap lanes 0 and 2, move to startx, and pre-scale value and
// per-pixel slope by 4096 (4096*4096 >> 16 == 256, matching the ambient scale)
3313 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3314 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3315 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3316 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3317 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3318 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// the for header advances data by one pixel per iteration
3319 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3321 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3322 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z values, one per pixel; data is stepped three more times inside this path
3325 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3326 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3327 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3328 data = _mm_add_ps(data, slope);
3329 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3330 data = _mm_add_ps(data, slope);
3331 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3332 data = _mm_add_ps(data, slope);
3333 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse*vertexcolor + ambient) * texel, with texel bytes in the high byte of each lane
3334 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3335 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3337 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3338 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3344 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3346 mod = _mm_packs_epi32(mod, mod);
3347 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3348 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still has to be written out
3350 if (pixel == buffer_FragColorbgra8)
3351 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3357 void DPSOFTRAST_VertexShader_Lightmap(void)
3359 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3360 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3361 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
//   output = color texture * (Color_Diffuse * lightmap + Color_Ambient)
// plus, in the GLOW permutation, Color_Glow * glow texture. The fourth channel of the
// lighting term comes from the Alpha uniform (diffuse and glow have their fourth lane zeroed).
// Pixels are written directly to the framebuffer row unless alpha test or a non-opaque
// blend mode is active, in which case they go to a local buffer that is passed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): some lines are not visible in this excerpt - the braces/else that separate
// the glow loop from the second loop, the declaration of pix2, the statements that
// advance x past a 4-pixel group, and whatever skips pixels whose mask byte is clear in the
// single-pixel paths. The second loop is presumably the non-GLOW alternative - confirm.
3364 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3367 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3368 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3369 int x, startx = span->startx, endx = span->endx;
3370 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3371 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3372 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3373 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3374 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3375 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3377 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3378 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer instead
3379 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3380 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256, lanes 0 and 2 swapped, lane 3 replaced by Alpha*255
3381 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3384 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256, lanes 0 and 2 swapped, lane 3 cleared
3385 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3388 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3390 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color scaled by 256, lanes 0 and 2 swapped, lane 3 cleared
3391 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3392 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3393 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits: ambient, high 64 bits: glow (used by the single-pixel path below)
3394 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3395 for (x = startx;x < endx;x++)
3397 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3398 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3401 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3402 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3403 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse*lightmap + ambient) * color + glowcolor * glow, two pixels per register
3404 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3405 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3406 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3407 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3408 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3409 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3410 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the low half computes (diffuse*lightmap + ambient) * color while the
// high half computes glowcolor * glow (the lightmap's high half is zero); the shuffle then
// adds the high half onto the low half
3416 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3417 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3418 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3419 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3420 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3421 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// second loop: same lighting computation without the glow term
3426 for (x = startx;x < endx;x++)
3428 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3429 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3432 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3433 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3434 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3435 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3436 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3437 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3438 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3444 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3445 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3446 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3447 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still has to be written out
3450 if (pixel == buffer_FragColorbgra8)
3451 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shader entry points that
// follow call these two functions before their definitions appear.
3456 void DPSOFTRAST_VertexShader_LightDirection(void);
3457 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader: forwards to the LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	// no work of its own; the LightDirection vertex stage does everything
	DPSOFTRAST_VertexShader_LightDirection();
}
3464 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3466 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (model space) vertex stage: runs the shared LightDirection vertex
// shader, then additionally loads TEXCOORD4, the attribute the LightDirection pixel
// shader uses when sampling the lightmap and deluxemap textures.
// NOTE(review): DPSOFTRAST_Array_Load presumably copies the input array into the
// post-transform array unmodified - confirm against its definition.
3471 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3473 DPSOFTRAST_VertexShader_LightDirection();
3474 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (model space) pixel stage: forwards thread, triangle and span
// unchanged to the shared LightDirection pixel shader, which tests thread->shader_mode
// against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE internally.
3477 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3479 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (tangent space) vertex stage: runs the shared LightDirection vertex
// shader, then additionally loads TEXCOORD4, the attribute the LightDirection pixel
// shader uses when sampling the lightmap and deluxemap textures.
// NOTE(review): DPSOFTRAST_Array_Load presumably copies the input array into the
// post-transform array unmodified - confirm against its definition.
3484 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3486 DPSOFTRAST_VertexShader_LightDirection();
3487 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (tangent space) pixel stage: forwards thread, triangle and span
// unchanged to the shared LightDirection pixel shader, which tests thread->shader_mode
// against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE internally.
3490 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3492 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the LightDirection family (also called by the FakeLight and
// LightDirectionMap vertex shaders). For every vertex it takes the dot products of the
// LightDir uniform and of the vertex-to-eye vector with the per-vertex
// svector/tvector/normal vectors (read from TEXCOORD1/2/3) and writes the results to
// TEXCOORD5 and TEXCOORD6 respectively. It finishes by calling
// DPSOFTRAST_Array_TransformProject on POSITION with the ModelViewProjectionMatrixM1 uniform.
3497 void DPSOFTRAST_VertexShader_LightDirection(void)
3500 int numvertices = dpsoftrast.numvertices;
3502 float LightVector[4];
3503 float EyePosition[4];
3504 float EyeVectorModelSpace[4];
// Copy the LightDir and EyePosition uniforms into locals; the dot products below
// read only components 0..2 of each.
3510 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3511 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3512 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3513 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3514 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3515 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3516 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3517 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Populate the post-transform arrays read by the loop below; TEXCOORD0 goes through
// the TexMatrixM1 uniform, the others are loaded as-is.
// NOTE(review): exact semantics of Array_Load/Array_Transform are defined elsewhere - confirm.
3518 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3519 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3520 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3521 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3522 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3523 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its svector/tvector/normal vectors (xyz only).
3525 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3526 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3527 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3528 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3529 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3530 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3531 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3532 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3533 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3534 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3535 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3536 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir),
// stored in TEXCOORD5 with the fourth component forced to 0.
3537 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3538 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3539 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3540 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3541 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3542 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3543 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// EyeVector: (EyePosition - position) dotted with the same three vectors,
// stored in TEXCOORD6 with the fourth component forced to 0.
3544 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3545 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3546 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3547 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3548 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3549 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3550 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3551 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3552 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3553 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Final position transform using the ModelViewProjectionMatrixM1 uniform.
// NOTE(review): the meaning of the -1 argument is defined by DPSOFTRAST_Array_TransformProject - confirm.
3555 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helper macros used by the pixel shaders.
// Min/Max expand each argument more than once, so arguments with side effects
// (e.g. i++) must not be passed to them.
// Vector3Dot indexes elements 0..2 of both operands; LengthSquared and Length build on it.
// Length goes through sqrt(), the double-precision C library routine.
3558 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3559 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3560 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3561 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3562 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line statement macro operating on the 3-vector v; it starts by computing
// the vector's length as sqrt(dot(v,v)) into a local float named len.
3563 #define DPSOFTRAST_Vector3Normalize(v)\
3566 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, FakeLight and both LightDirectionMap
// modes; mode-specific behaviour is selected through thread->shader_mode.
// thread/triangle/span are passed on to the span helpers (Draw_Span_Begin,
// Draw_Span_Texture2DVaryingBGRA8, DPSOFTRAST_CALCATTRIB4F, Draw_Span_FinishBGRA8).
// One of three per-pixel loops runs depending on thread->shader_permutation:
//   SPECULAR -> ambient + diffuse + specular,
//   DIFFUSE  -> ambient + diffuse,
//   a third loop that applies the ambient color only.
// Each loop fills buffer_FragColorbgra8 for x in [span->startx, span->endx), and the
// buffer is then handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3577 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3579 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3580 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3586 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3587 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589 int x, startx = span->startx, endx = span->endx;
3590 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3591 float LightVectordata[4];
3592 float LightVectorslope[4];
3593 float EyeVectordata[4];
3594 float EyeVectorslope[4];
3595 float VectorSdata[4];
3596 float VectorSslope[4];
3597 float VectorTdata[4];
3598 float VectorTslope[4];
3599 float VectorRdata[4];
3600 float VectorRslope[4];
3602 float diffusetex[4];
3604 float surfacenormal[4];
3605 float lightnormal[4];
3606 float lightnormal_modelspace[4];
3608 float specularnormal[4];
3611 float SpecularPower;
// Color uniforms: uniform components 0/1/2 are stored at indices 2/1/0, so index i of
// each Color_* array pairs with byte i of the *bgra8 buffers in the math below.
3613 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3614 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3615 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3616 Color_Glow[3] = 0.0f;
3617 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3618 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3619 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth slot of Color_Ambient carries the Alpha uniform; it scales diffusetex[3]
// into the output alpha d[3] in every loop.
3620 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3621 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3622 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3623 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3624 Color_Pants[3] = 0.0f;
3625 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3626 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3627 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3628 Color_Shirt[3] = 0.0f;
// Start the span and sample the color texture along it using TEXCOORD0.
// NOTE(review): Draw_Span_Begin presumably fills buffer_z for the span - confirm.
3629 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3630 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Pants/shirt layers are sampled only when the COLORMAPPING permutation bit is set.
3631 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3633 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// The glow texture is sampled only when the GLOW permutation bit is set.
3636 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3638 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SPECULAR permutation (ambient + diffuse + specular) ----
3640 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3642 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3643 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3644 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3645 Color_Diffuse[3] = 0.0f;
3646 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3647 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3648 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3649 LightColor[3] = 0.0f;
3650 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3652 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3653 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3654 Color_Specular[3] = 0.0f;
// The exponent uniform is pre-scaled by 1/255 because it is later multiplied by the
// gloss texture's alpha byte (glosstex[3]) in the pow() call.
3655 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3656 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3657 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific span inputs: model-space deluxemapping needs the S/T/R vector
// interpolants plus lightmap and deluxemap samples (both addressed by TEXCOORD4).
3659 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3661 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3662 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3663 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3664 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3665 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Tangent-space deluxemapping needs only the lightmap and deluxemap samples.
3667 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3669 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3670 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3672 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3674 // nothing of this needed
// Remaining modes: interpolate the light vector written to TEXCOORD5 by the vertex shader.
3678 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3681 for (x = startx;x < endx;x++)
// Start from the sampled color texel, converted to float (byte values 0..255).
3684 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3685 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3686 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3687 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Add the pants/shirt layers tinted by their colors; the [3] terms add nothing
// because Color_Pants[3] and Color_Shirt[3] were set to 0 above.
3688 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3690 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3691 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3692 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3693 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3695 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3696 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3697 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3698 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel as byte/128 - 1 per axis; byte 2 becomes component 0
// and byte 0 becomes component 2. The result is then normalized.
3699 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3700 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3701 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3702 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Derive the light normal and light color according to the shader mode.
3704 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3706 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3707 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3708 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3709 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3711 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3712 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3713 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3714 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3716 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3717 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3718 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3719 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3721 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3722 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3723 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3724 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3726 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3727 DPSOFTRAST_Vector3Normalize(lightnormal);
3729 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by max(0.25, lightnormal[2]).
3731 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3732 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3733 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3734 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded texel is used as the light normal directly,
// and the light color is the lightmap texel scaled by 1/256.
3737 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3739 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3740 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3741 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3743 float f = 1.0f / 256.0f;
3744 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3745 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3746 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the normalized interpolated eye vector doubles as the light normal and
// the light color is forced to 1.0 per channel.
// NOTE(review): z is presumably the per-pixel value taken from buffer_z[x] - confirm.
3749 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3751 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3752 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3753 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3754 DPSOFTRAST_Vector3Normalize(lightnormal);
3756 LightColor[0] = 1.0;
3757 LightColor[1] = 1.0;
3758 LightColor[2] = 1.0;
// Remaining modes: light normal from the interpolated TEXCOORD5 light vector;
// LightColor keeps the uniform value loaded before the loop.
3762 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3763 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3764 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3765 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surfacenormal, lightnormal), clamped at zero.
3768 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3770 if(thread->shader_exactspecularmath)
3772 // reflect lightnormal at surfacenormal, take the negative of that
3773 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3775 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3776 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3777 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3778 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3780 // dot of this and normalize(EyeVectorFogDepth.xyz)
3781 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3782 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3783 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3784 DPSOFTRAST_Vector3Normalize(eyenormal);
3786 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact specular: normalize(lightnormal + eyenormal) dotted with the surface normal.
3790 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3791 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3792 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3793 DPSOFTRAST_Vector3Normalize(eyenormal);
3795 specularnormal[0] = lightnormal[0] + eyenormal[0];
3796 specularnormal[1] = lightnormal[1] + eyenormal[1];
3797 specularnormal[2] = lightnormal[2] + eyenormal[2];
3798 DPSOFTRAST_Vector3Normalize(specularnormal);
3800 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Raise to the specular exponent, modulated per pixel by the gloss texture's alpha byte.
3803 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel. NOTE(review): only the upper bound (255) is clamped
// before the value is stored into the unsigned char output buffer.
3804 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3806 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3807 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3808 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3809 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3813 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3814 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3815 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3816 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3819 buffer_FragColorbgra8[x*4+0] = d[0];
3820 buffer_FragColorbgra8[x*4+1] = d[1];
3821 buffer_FragColorbgra8[x*4+2] = d[2];
3822 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: DIFFUSE permutation (ambient + diffuse, no specular) ----
// NOTE(review): unlike the specular path, this loop does not add the pants/shirt
// layers to diffusetex even when COLORMAPPING is set - confirm this is intended.
3825 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3827 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3828 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3829 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3830 Color_Diffuse[3] = 0.0f;
3831 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3832 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3833 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3834 LightColor[3] = 0.0f;
3835 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific span inputs, as in the specular path; here the eye vector is only
// interpolated for the fake-light mode.
3837 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3839 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3840 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3841 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3842 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3843 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3845 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3847 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3848 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3850 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3852 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3856 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3859 for (x = startx;x < endx;x++)
3862 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3863 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3864 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3865 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal map texel (byte/128 - 1, byte 2 -> component 0).
3866 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3867 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3868 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3869 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Light normal / light color per shader mode; same derivation as in the specular path.
3871 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3873 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3874 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3875 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3876 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3878 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3879 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3880 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3881 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3883 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3884 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3885 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3886 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3888 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3889 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3890 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3891 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3893 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3894 DPSOFTRAST_Vector3Normalize(lightnormal);
3896 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3898 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3899 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3900 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3901 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3904 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3906 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3907 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3908 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3910 float f = 1.0f / 256.0f;
3911 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3912 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3913 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3916 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3918 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921 DPSOFTRAST_Vector3Normalize(lightnormal);
3923 LightColor[0] = 1.0;
3924 LightColor[1] = 1.0;
3925 LightColor[2] = 1.0;
3929 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3930 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3931 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3932 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then texel * (ambient + diffuse * light) per channel;
// alpha is texel alpha * Color_Ambient[3]. Only the upper bound (255) is clamped.
3935 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3936 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3938 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3939 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3940 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3941 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3945 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3946 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3947 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3948 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3950 buffer_FragColorbgra8[x*4+0] = d[0];
3951 buffer_FragColorbgra8[x*4+1] = d[1];
3952 buffer_FragColorbgra8[x*4+2] = d[2];
3953 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only - color texel scaled by Color_Ambient, plus glow if enabled ----
3958 for (x = startx;x < endx;x++)
3961 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3962 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3963 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3964 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3966 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3968 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3969 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3970 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3971 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3975 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3976 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3977 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3978 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3980 buffer_FragColorbgra8[x*4+0] = d[0];
3981 buffer_FragColorbgra8[x*4+1] = d[1];
3982 buffer_FragColorbgra8[x*4+2] = d[2];
3983 buffer_FragColorbgra8[x*4+3] = d[3];
// Pass the shaded span to the common BGRA8 finish routine.
3986 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode. For every vertex it takes the dot products of
// the vertex-to-light and vertex-to-eye vectors with the per-vertex
// svector/tvector/normal vectors (read from TEXCOORD1/2/3) and writes them back into
// TEXCOORD1 (light vector) and TEXCOORD2 (eye vector), replacing the svector/tvector
// data stored there. Afterwards POSITION is passed to DPSOFTRAST_Array_TransformProject
// with the ModelViewProjectionMatrixM1 uniform, and TEXCOORD3 is written by
// DPSOFTRAST_Array_Transform using the ModelToLightM1 uniform.
3991 void DPSOFTRAST_VertexShader_LightSource(void)
3994 int numvertices = dpsoftrast.numvertices;
3995 float LightPosition[4];
3996 float LightVector[4];
3997 float LightVectorModelSpace[4];
3998 float EyePosition[4];
3999 float EyeVectorModelSpace[4];
// Copy the LightPosition and EyePosition uniforms into locals; the per-vertex math
// below reads only components 0..2 of each.
4005 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4006 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4007 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4008 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4009 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4010 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4011 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4012 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Populate the post-transform arrays read by the loop below; TEXCOORD0 goes through
// the TexMatrixM1 uniform, the others are loaded as-is.
// NOTE(review): exact semantics of Array_Load/Array_Transform are defined elsewhere - confirm.
4013 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4014 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4015 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4016 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4017 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4018 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4019 for (i = 0;i < numvertices;i++)
// Copy position and the svector/tvector/normal vectors into locals first: TEXCOORD1
// and TEXCOORD2 are overwritten with the results further down.
4021 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4022 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4023 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4024 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4025 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4026 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4027 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4028 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4029 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4030 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4031 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4032 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector: (LightPosition - position) dotted with svector, tvector and normal,
// stored in TEXCOORD1 with the fourth component forced to 0.
4033 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4034 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4035 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4036 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4037 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4038 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4039 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4040 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4041 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4042 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// EyeVector: (EyePosition - position) dotted with the same three vectors,
// stored in TEXCOORD2 with the fourth component forced to 0.
4043 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4044 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4045 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4046 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4047 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4048 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4049 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4050 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4051 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4052 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Final position transform, then TEXCOORD3 is produced from POSITION with the
// ModelToLightM1 uniform.
// NOTE(review): the -1 argument and which POSITION data Array_Transform reads here
// (input vs. already-projected) depend on the helper definitions - confirm.
4054 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4055 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader paired with DPSOFTRAST_VertexShader_LightSource in
// DPSOFTRAST_ShaderModeTable. For every pixel x in [span->startx, span->endx)
// it computes a lit BGRA8 color into buffer_FragColorbgra8 and then passes the
// span to DPSOFTRAST_Draw_Span_FinishBGRA8.
// One of three lighting paths is chosen from thread->shader_permutation:
//   SHADERPERMUTATION_SPECULAR -> ambient + diffuse + specular
//   SHADERPERMUTATION_DIFFUSE  -> ambient + diffuse
//   otherwise                  -> ambient only
// NOTE(review): this listing omits some lines of the function (braces, else
// lines, the statements guarded by the "attenuation < 0.01f" tests, and the
// declarations/assignments of several locals used below such as z, d,
// attenuation, glosstex, eyenormal, diffuse, specular, f). The comments here
// describe only what the visible lines establish.
4058 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4061 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4062 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4063 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4064 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4065 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4066 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4067 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4069 int x, startx = span->startx, endx = span->endx;
4070 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4071 float CubeVectordata[4];
4072 float CubeVectorslope[4];
4073 float LightVectordata[4];
4074 float LightVectorslope[4];
4075 float EyeVectordata[4];
4076 float EyeVectorslope[4];
4078 float diffusetex[4];
4080 float surfacenormal[4];
4081 float lightnormal[4];
4083 float specularnormal[4];
4086 float SpecularPower;
4087 float CubeVector[4];
// Load the color uniforms with components 0 and 2 swapped (uniform +0 goes to
// index [2], uniform +2 goes to index [0]); presumably this matches the B,G,R
// byte order of the *bgra8 buffers they are multiplied with — confirm.
// Color_Glow is loaded here but not referenced by any visible line below.
4090 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4091 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4092 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4093 Color_Glow[3] = 0.0f;
4094 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4095 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4096 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the separate Alpha uniform
4097 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4098 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4099 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4100 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4101 Color_Diffuse[3] = 0.0f;
4102 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4103 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4104 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4105 Color_Specular[3] = 0.0f;
4106 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4107 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4108 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4109 Color_Pants[3] = 0.0f;
4110 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4111 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4112 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4113 Color_Shirt[3] = 0.0f;
4114 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4115 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4116 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4117 LightColor[3] = 0.0f;
// the specular exponent uniform is pre-scaled by 1/255 because it is later
// multiplied by the gloss texture's alpha byte (0..255) inside pow()
4118 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// per-span interpolants for texcoords 1..3; the loops below evaluate them as
// (data + slope*x) * z
4119 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4120 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4121 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4122 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4123 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// sample the textures needed by the active permutation for the whole span
4124 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4125 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4127 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4128 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4130 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4131 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
4132 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4134 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4135 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4136 for (x = startx;x < endx;x++)
// NOTE(review): z is assigned on a line not shown here — presumably from buffer_z[x]; confirm.
4139 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4140 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4141 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation is 1 minus the value returned by DPSOFTRAST_Vector3LengthSquared(CubeVector);
// the statement guarded by the threshold test below is not visible in this listing
4142 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4143 if (attenuation < 0.01f)
4145 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4147 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4148 if (attenuation < 0.01f)
// base color texel as floats in byte range (0..255), optionally tinted by pants/shirt layers
4152 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4153 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4154 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4155 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4156 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4158 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4159 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4160 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4161 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4163 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4164 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4165 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4166 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte*(1/128)-1 maps 0..255 to roughly -1..+1;
// bytes 2,1,0 of the texel feed components 0,1,2 of the vector
4167 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4168 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4169 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4170 DPSOFTRAST_Vector3Normalize(surfacenormal);
4172 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4173 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4174 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4175 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), clamped at 0
4177 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant A: reflect the light vector about the surface normal
// and dot it with the normalized eye vector
4179 if(thread->shader_exactspecularmath)
4181 // reflect lightnormal at surfacenormal, take the negative of that
4182 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4184 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4185 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4186 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4187 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4189 // dot of this and normalize(EyeVectorFogDepth.xyz)
4190 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4191 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4192 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4193 DPSOFTRAST_Vector3Normalize(eyenormal);
4195 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant B: normalized sum of light and eye vectors dotted
// with the surface normal
4199 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4200 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4201 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4202 DPSOFTRAST_Vector3Normalize(eyenormal);
4204 specularnormal[0] = lightnormal[0] + eyenormal[0];
4205 specularnormal[1] = lightnormal[1] + eyenormal[1];
4206 specularnormal[2] = lightnormal[2] + eyenormal[2];
4207 DPSOFTRAST_Vector3Normalize(specularnormal);
4209 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) * gloss alpha byte
4211 specular = pow(specular, SpecularPower * glosstex[3]);
// combine; each color channel is truncated to int and clamped to at most 255,
// alpha is taken straight from diffusetex[3]
4213 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4215 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4216 attenuation *= (1.0f / 255.0f);
4217 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4218 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4219 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4220 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4224 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4225 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4226 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4227 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4229 buffer_FragColorbgra8[x*4+0] = d[0];
4230 buffer_FragColorbgra8[x*4+1] = d[1];
4231 buffer_FragColorbgra8[x*4+2] = d[2];
4232 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (same steps as path 1 without gloss/specular) ----
4235 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4237 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4238 for (x = startx;x < endx;x++)
4241 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4242 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4243 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4244 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4245 if (attenuation < 0.01f)
4247 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4249 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4250 if (attenuation < 0.01f)
4254 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4255 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4256 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4257 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4258 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4260 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4261 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4262 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4263 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4265 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4266 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4267 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4268 DPSOFTRAST_Vector3Normalize(surfacenormal);
4270 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4271 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4272 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4273 DPSOFTRAST_Vector3Normalize(lightnormal);
4275 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4276 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4278 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4279 attenuation *= (1.0f / 255.0f);
4280 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4281 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4282 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4283 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4287 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4288 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4289 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4290 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4292 buffer_FragColorbgra8[x*4+0] = d[0];
4293 buffer_FragColorbgra8[x*4+1] = d[1];
4294 buffer_FragColorbgra8[x*4+2] = d[2];
4295 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal map lookup, no diffuse/specular terms) ----
4300 for (x = startx;x < endx;x++)
4303 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4304 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4305 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4306 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4307 if (attenuation < 0.01f)
4309 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4311 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4312 if (attenuation < 0.01f)
4316 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4317 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4318 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4319 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4320 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4322 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4323 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4324 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4325 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4327 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4329 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4330 attenuation *= (1.0f / 255.0f);
4331 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4332 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4333 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4334 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4338 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4339 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4340 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4341 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4343 buffer_FragColorbgra8[x*4+0] = d[0];
4344 buffer_FragColorbgra8[x*4+1] = d[1];
4345 buffer_FragColorbgra8[x*4+2] = d[2];
4346 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colors to the common span finisher
4349 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode. Issues three array transforms:
// TEXCOORD1 from POSITION with the ModelViewProjection matrix uniform,
// TEXCOORD0 from TEXCOORD0 with the TexMatrix uniform, and a transform+project
// of POSITION from POSITION with the ModelViewProjection matrix uniform.
// NOTE(review): assumes the first array argument is the destination and the
// second the source — consistent with DPSOFTRAST_PixelShader_Refraction reading
// TEXCOORD1 as the ModelViewProjection position, but confirm against
// DPSOFTRAST_Array_Transform.
4355 void DPSOFTRAST_VertexShader_Refraction(void)
4357 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4358 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4359 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the refraction shader mode. For every pixel x in
// [span->startx, span->endx) it computes a screen-space texture coordinate
// from the interpolated ModelViewProjection position (TEXCOORD1), offsets it
// by the x/y of the normalized normal-map texel scaled by
// DistortScaleRefractReflect, samples mip level 0 of the texture bound to
// GL20TU_REFRACTION at that coordinate (4-texel blend when the texture has
// DPSOFTRAST_TEXTURE_FILTER_LINEAR, single texel otherwise), multiplies the
// B,G,R bytes by RefractColor and stores the result in buffer_FragColorbgra8,
// which is then passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Returns immediately, drawing nothing, when no refraction texture is bound.
// NOTE(review): this listing omits some lines of the function (braces, the
// else line, and the declarations of iw, v and c); only visible lines are
// reproduced here.
4362 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4364 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4366 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4368 int x, startx = span->startx, endx = span->endx;
4371 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4372 //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4373 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4376 float ModelViewProjectionPositiondata[4];
4377 float ModelViewProjectionPositionslope[4];
4380 float ScreenScaleRefractReflect[2];
4381 float ScreenCenterRefractReflect[2];
4382 float DistortScaleRefractReflect[2];
4383 float RefractColor[4];
4385 const unsigned char * RESTRICT pixelbase;
4386 const unsigned char * RESTRICT pixel[4];
4387 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4388 if(!texture) return;
// start of mip level 0 texel data; mipmap[0][2] and mipmap[0][3] are used below
// as that level's width and height
4389 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4392 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4393 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4394 //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4397 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4400 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4401 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4402 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4403 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4404 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4405 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// color uniform is loaded with components 0 and 2 swapped so it lines up with
// the B,G,R byte order of the texels it multiplies
4406 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4407 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4408 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4409 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4412 for (x = startx;x < endx;x++)
4414 float SafeScreenTexCoord[2];
4415 float ScreenTexCoord[2];
4422 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4423 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4425 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4426 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4427 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4429 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4430 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4431 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4432 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4433 DPSOFTRAST_Vector3Normalize(v);
4434 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4435 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4437 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4438 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Texel coordinates in fixed point with a 12-bit fraction, minus half a texel
// (0x800 == 2048). The fraction width must be 12 bits because frac is masked
// with 0xFFF, the weights are built against 0x1000, and the weighted sum is
// normalized with >>24 (12+12 bits). The previous code scaled by <<16 and
// split with >>16 while still masking with 0xFFF, so the blend weights came
// from the wrong bits, and its Y bias was mistyped (32678 instead of 32768).
// tc is signed so that coordinates left of/above the texture stay negative
// and are clamped to 0 below; converting a negative float to unsigned int is
// undefined and made the ">= 0" clamps unreachable.
// NOTE(review): tc>>12 relies on arithmetic right shift of negative ints
// (implementation-defined in C, but what GCC, Clang and MSVC do).
4440 int tc[2] = { (int)(ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048), (int)(ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048) };
4441 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4442 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights; they always sum to 0x1000*0x1000 == 1<<24
4443 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4444 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4445 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel columns/rows to the valid range [0, size-1]
4446 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4447 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4448 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4449 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4450 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4451 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4452 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4453 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4454 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4455 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4456 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// unfiltered path: one clamped texel
// (NOTE(review): the lines that copy pixel[0] into c are not visible in this listing)
4460 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4461 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4462 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4463 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4464 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4465 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4466 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4472 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// tint the sampled color; alpha comes only from RefractColor[3], capped at 255
4473 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4474 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4475 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4476 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4479 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode: the only visible work is a
// transform+project of the POSITION array (source and destination both
// POSITION) with the ModelViewProjection matrix uniform.
4484 void DPSOFTRAST_VertexShader_Water(void)
4486 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Placeholder span shader for the water shader mode: begins the span,
// zero-fills the bytes for pixels [span->startx, span->endx) in a local BGRA8
// buffer, and finishes the span with that buffer. No shading is computed.
4490 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4493 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4494 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4495 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4496 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4497 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode: the only visible work is a
// transform+project of the POSITION array (source and destination both
// POSITION) with the ModelViewProjection matrix uniform.
4502 void DPSOFTRAST_VertexShader_ShowDepth(void)
4504 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Placeholder span shader for the show-depth shader mode: begins the span,
// zero-fills the bytes for pixels [span->startx, span->endx) in a local BGRA8
// buffer, and finishes the span with that buffer. No depth visualisation is
// computed in the visible lines.
4507 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4510 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4511 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4512 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4513 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4514 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode: the only visible work is
// a transform+project of the POSITION array (source and destination both
// POSITION) with the ModelViewProjection matrix uniform.
4519 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4521 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Placeholder span shader for the deferred-geometry shader mode: begins the
// span, zero-fills the bytes for pixels [span->startx, span->endx) in a local
// BGRA8 buffer, and finishes the span with that buffer. No shading is computed.
4524 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4527 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4528 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4529 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4530 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4531 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode: the only visible
// work is a transform+project of the POSITION array (source and destination
// both POSITION) with the ModelViewProjection matrix uniform.
4536 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4538 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Placeholder span shader for the deferred-light-source shader mode: begins
// the span, zero-fills the bytes for pixels [span->startx, span->endx) in a
// local BGRA8 buffer, and finishes the span with that buffer. No lighting is
// computed.
4541 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4544 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4545 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4546 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4547 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4548 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record used by DPSOFTRAST_ShaderModeTable.
// NOTE(review): every table initializer starts with an integer before the two
// function pointers, so the struct has a leading member declared on a line
// that is not visible in this listing.
4553 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point; takes no arguments
4556 void (*Vertex)(void);
// span (pixel) stage entry point, called once per span with the owning thread and triangle
4557 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex array ids listed for this mode; table initializers end the list with ~0
4558 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// texture unit ids listed for this mode; table initializers end the list with ~0
4559 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4561 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with one DPSOFTRAST_ShaderModeInfo per shader mode
// (SHADERMODE_COUNT entries); DPSOFTRAST_Draw_ProcessSpans indexes it with
// thread->shader_mode to call the Span function.
// Each entry is: leading integer, vertex shader, span shader, list of array
// ids, list of texture unit ids; both lists are terminated by ~0.
// NOTE(review): entry order presumably has to match the SHADERMODE_* enum
// order — confirm against its declaration.
// NOTE(review): the last four entries give only one brace list ({~0}), so
// their texunits member is zero-initialized rather than starting with ~0;
// confirm that consumers of texunits tolerate this.
4563 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4565 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4566 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4567 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4568 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4569 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4570 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4571 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4572 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4573 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4574 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4575 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4576 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4577 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4578 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4579 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4580 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued on this thread (thread->spans[0 .. numspans)):
// optional depth test against the depth buffer, pixel shading through the
// active shader mode's Span callback, and a depth-write pass. The span queue
// is emptied (numspans = 0) before returning.
4583 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4590 // unsigned int *colorpixel;
4591 unsigned int *depthpixel;
4597 DPSOFTRAST_State_Triangle *triangle;
4598 DPSOFTRAST_State_Span *span;
// one pass/fail flag per pixel of the span currently being processed
4599 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4600 for (i = 0; i < thread->numspans; i++)
4602 span = &thread->spans[i];
// a span refers to its triangle by index into thread->triangles
4603 triangle = &thread->triangles[span->triangle];
// depth-tested path: requires depth testing to be enabled AND a depth buffer to exist
4604 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the span origin
4606 wslope = triangle->w[0];
4607 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert slope and start value to integer depth units; the polygon offset
// (constant term plus a term scaled by |wslope|) is subtracted from the start value
4608 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4609 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel points at the span origin, so depthpixel[x] uses span-relative x
4610 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4611 startx = span->startx;
// fill pixelmask[startx .. endx) according to the depth function; every
// comparison is written as (stored depth) OP (interpolated depth d)
4613 switch(thread->fb_depthfunc)
4616 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4617 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4618 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4619 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4620 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4621 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4622 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4624 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4625 //for (x = startx;x < endx;x++)
4626 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4627 // if there is no color buffer, skip pixel shader
// skip pixels that failed the depth test at the left end of the span...
4628 while (startx < endx && !pixelmask[startx])
// ...and at the right end
4630 while (endx > startx && !pixelmask[endx-1])
4633 continue; // no pixels to fill
// hand the mask and the tightened start to the shader through the span itself
4634 span->pixelmask = pixelmask;
4635 span->startx = startx;
4637 // run pixel shader if appropriate
4638 // do this before running depthmask code, to allow the pixelshader
4639 // to clear pixelmask values for alpha testing
4640 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4641 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth-write pass: walks the surviving range with the same depth interpolation as the test above
4642 if (thread->depthmask)
4643 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4649 // no depth testing means we're just dealing with color...
4650 // if there is no color buffer, skip pixel shader
4651 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// without a depth test every pixel in [span->startx, span->endx) is marked as passing
4653 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4654 span->pixelmask = pixelmask;
4655 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4659 thread->numspans = 0;
// Declares the Draw command (id 22) and its payload: total vertex data size,
// the y range covered by the draw (starty/endy), a reference counter shared
// by the worker threads, the clipped flag, the vertex range, the triangle
// count, the vertex data pointer and the 32-bit / 16-bit element lists.
// NOTE(review): DEFCOMMAND itself is defined elsewhere; it presumably produces
// DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw, which the code below uses.
4662 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of a single thread: for each triangle it
// clips against the near plane, sets up the w and attribute plane equations,
// picks mip levels, and emits spans restricted to this thread's two scanline
// bands and the scissor rectangle. Spans are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever the span or triangle queue fills up.
// The thread drops one reference on the command when it is done with it.
4664 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4667 int cullface = thread->cullface;
4668 int minx, maxx, miny, maxy;
4669 int miny1, maxy1, miny2, maxy2;
4670 __m128i fbmin, fbmax;
4671 __m128 viewportcenter, viewportscale;
4672 int firstvertex = command->firstvertex;
4673 int numvertices = command->numvertices;
4674 int numtriangles = command->numtriangles;
4675 const int *element3i = command->element3i;
4676 const unsigned short *element3s = command->element3s;
4677 int clipped = command->clipped;
4684 int starty, endy, bandy;
4688 __m128 triangleedge1, triangleedge2, trianglenormal;
4691 DPSOFTRAST_State_Triangle *triangle;
4692 DPSOFTRAST_Texture *texture;
4693 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical scissor range, then this thread's two bands limited to that range
// (assuming bound(lo, v, hi) clamps v into [lo, hi])
4694 miny = thread->fb_scissor[1];
4695 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4696 miny1 = bound(miny, thread->miny1, maxy);
4697 maxy1 = bound(miny, thread->maxy1, maxy);
4698 miny2 = bound(miny, thread->miny2, maxy);
4699 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range [starty, endy) touches neither band: nothing to draw here
4700 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// drop this thread's reference; whoever brings the count to zero cleans up
4702 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare header keeps its vertex data in a
// separate allocation (see DPSOFTRAST_Draw_AllocateDrawCommand), which is freed here
4704 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4705 MM_FREE(command->arrays);
// horizontal scissor range
4709 minx = thread->fb_scissor[0];
4710 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle as packed 16-bit (x, y) pairs: (minx, miny1) up to the
// inclusive corner (maxx-1, maxy2-1), i.e. from the top of the first band to
// the bottom of the second band
4711 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4712 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4713 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4714 viewportscale = _mm_load_ps(thread->fb_viewportscale);
// the fourth screen vertex and the clip fractions start out as zero
4715 screen[3] = _mm_setzero_ps();
4716 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4717 for (i = 0;i < numtriangles;i++)
// command->arrays begins with the screen-space coordinates (4 floats per
// vertex); the next block of numvertices*4 floats is what the near-plane
// clipping below reads z and w from
4719 const float *screencoord4f = command->arrays;
4720 const float *arrays = screencoord4f + numvertices*4;
4722 // generate the 3 edges of this triangle
4723 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices from the 16-bit element list, rebased to the first vertex
4726 e[0] = element3s[i*3+0] - firstvertex;
4727 e[1] = element3s[i*3+1] - firstvertex;
4728 e[2] = element3s[i*3+2] - firstvertex;
// same, from the 32-bit element list
4732 e[0] = element3i[i*3+0] - firstvertex;
4733 e[1] = element3i[i*3+1] - firstvertex;
4734 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below:
//  SKIPBACKFACE      - builds the two screen-space edges around screen[1] and
//                      tests the sign of one cross-product component
//  CLIPPEDVERTEXLERP - computes the clip fraction between vertices p1 and p2
//                      and projects the interpolated vertex into screen[k]
//  CLIPPEDVERTEXCOPY - loads an already-projected vertex into screen[k]
4743 #define SKIPBACKFACE \
4744 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4745 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4746 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4747 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4748 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4752 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4756 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4761 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4762 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4764 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4765 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4767 #define CLIPPEDVERTEXCOPY(k,p1) \
4768 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY / GENATTRIBLERP / GENATTRIBS fetch the three per-vertex values
// of the current attribute array, re-using clipfrac[] for clipped vertices
4770 #define GENATTRIBCOPY(attrib, p1) \
4771 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4772 #define GENATTRIBLERP(attrib, p1, p2) \
4774 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4775 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4777 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4781 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4782 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4783 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4784 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4785 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4786 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4787 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4793 // calculate distance from nearplane
// clip distance of each vertex is z + w; negative means behind the near plane
4794 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4795 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4796 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over which of the three vertices are in front of the near
// plane; each leaf fills screen[] with copied or interpolated vertices
4797 if (clipdist[0] >= 0.0f)
4799 if (clipdist[1] >= 0.0f)
4801 if (clipdist[2] >= 0.0f)
4804 // triangle is entirely in front of nearplane
4805 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
// vertex 2 is behind the plane: the result has four points
4812 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4820 if (clipdist[2] >= 0.0f)
// vertex 1 is behind the plane: four points
4822 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
// only vertex 0 is in front: three points
4829 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4836 else if (clipdist[1] >= 0.0f)
4838 if (clipdist[2] >= 0.0f)
// vertex 0 is behind the plane: four points
4840 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
// only vertex 1 is in front: three points
4847 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4853 else if (clipdist[2] >= 0.0f)
// only vertex 2 is in front: three points
4855 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4860 else continue; // triangle is entirely behind nearplane
4863 // calculate integer y coords for triangle points
// pack the truncated (x, y) of every point into 16-bit lanes; with three
// points the fourth slot repeats screen[2]. The min/max steps reduce the
// points to a bounding box, which is then clamped to fbmin/fbmax.
4864 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4865 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4866 screenmin = _mm_min_epi16(screeni, screenir),
4867 screenmax = _mm_max_epi16(screeni, screenir);
4868 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4869 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4870 screenmin = _mm_max_epi16(screenmin, fbmin);
4871 screenmax = _mm_min_epi16(screenmax, fbmax);
4872 // skip offscreen triangles
4873 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// first row and one-past-last row of the clamped bounding box
4875 starty = _mm_extract_epi16(screenmin, 1);
4876 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two bands
4877 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift leaves the y of each point in its own 32-bit lane
4879 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the per-thread triangle queue
4882 triangle = &thread->triangles[thread->numtriangles];
4884 // calculate attribute plans for triangle data...
4885 // okay, this triangle is going to produce spans, we'd better project
4886 // the interpolants now (this is what gives perspective texturing),
4887 // this consists of simply multiplying all arrays by the W coord
4888 // (which is basically 1/Z), which will be undone per-pixel
4889 // (multiplying by Z again) to get the perspective-correct array
4892 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4893 __m128 mipedgescale, mipdensity;
// edge components divided by the cross-product term computed in SKIPBACKFACE,
// then broadcast into one register per component
4894 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4895 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4896 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4897 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4898 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast the w component of each of the first three screen vertices
4899 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4900 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4901 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w itself: x slope, y slope and value at the screen origin
4902 attribedge1 = _mm_sub_ss(w0, w1);
4903 attribedge2 = _mm_sub_ss(w2, w1);
4904 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4905 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4906 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4907 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4908 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4909 _mm_store_ss(&triangle->w[0], attribxslope);
4910 _mm_store_ss(&triangle->w[1], attribyslope);
4911 _mm_store_ss(&triangle->w[2], attriborigin);
4912 mipedgescale = _mm_setzero_ps();
// one plane equation per vertex array listed for the active shader mode
4913 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4915 __m128 attrib0, attrib1, attrib2;
4916 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range entry (the table uses ~0) marks the end of the list
4917 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next block of numvertices*4 floats in the command's vertex data
4919 arrays += numvertices*4;
4920 GENATTRIBS(attrib0, attrib1, attrib2);
// multiply each vertex value by its w before building the plane (see the comment above)
4921 attriborigin = _mm_mul_ps(attrib1, w1);
4922 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4923 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4924 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4925 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4926 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
// stored as [0] = x slope, [1] = y slope, [2] = origin, matching triangle->w
4927 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4928 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4929 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the shader mode's LOD array drives mip selection: attribute change along
// the two edges scaled by the reciprocal screen-space edge length
4930 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4932 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4933 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4934 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4935 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level defaults to 0 for every texture unit
4939 memset(triangle->mip, 0, sizeof(triangle->mip));
4940 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4942 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
// an out-of-range entry (the table uses ~0) marks the end of the list
4943 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4945 texture = thread->texbound[texunit];
// only filters ranked above DPSOFTRAST_TEXTURE_FILTER_LINEAR get a computed mip level
4946 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3], square, sum per
// edge, and keep the smaller of the two edge densities
4948 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4949 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4950 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4951 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4952 // this will be multiplied in the texturing routine by the texture resolution
4953 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), since the density above is a squared quantity
4956 y = (int)(log((float)y)*0.5f/M_LN2);
// never select a level beyond the last one the texture has
4957 if (y > texture->mipmaps - 1)
4958 y = texture->mipmaps - 1;
4959 triangle->mip[texunit] = y;
// scanline walk: the first pass is limited to the first band (up to maxy1);
// the loop increment then jumps y to miny2 and limits the pass to maxy2
4965 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4968 __m128 xcoords, xslope;
// one 32-bit lane per point, all ones where y is greater than that point's y;
// movemask turns this into four mask bits per point
4969 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4970 int yccmask = _mm_movemask_epi8(ycc);
// each edge runs from point edgeNp to point edgeNn
4971 int edge0p, edge0n, edge1p, edge1n;
// edge selection table with four independent points
// NOTE(review): presumably chosen when numpoints is 4 - the selecting
// statement is not among the lines shown here
4978 case 0xFFFF: /*0000*/ y = endy; continue;
4979 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4980 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4981 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4982 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4983 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4984 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4985 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4986 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4987 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4988 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4989 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4990 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4991 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4992 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4993 case 0x0000: /*1111*/ y++; continue;
// edge selection table for three points: the fourth lane repeats point 2, so
// the two upper mask digits are always equal
5001 case 0xFFFF: /*000*/ y = endy; continue;
5002 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5003 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5004 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5005 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5006 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5007 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5008 case 0x0000: /*111*/ y++; continue;
// lanes of points already passed become 0x7FFF, so the min reductions pick
// the smallest y among the remaining points: the row where the edge set changes
5011 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5012 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5013 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5014 nexty = _mm_extract_epi16(ycc, 0);
// never run past the end of the current band
5015 if (nexty >= bandy) nexty = bandy-1;
// per-edge delta divided by its own y component gives dx/dy for both edges
5016 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5017 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
// x of both edges on row y, plus 0.5
5018 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5019 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5020 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the edge with the smaller x in the low half: swap halves when edge 0 is to the right
5021 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5023 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5024 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// one iteration per row up to nexty, stepping both edge x values by their slopes
5026 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5028 int startx, endx, offset;
// low lane = left edge x, third lane = right edge x
5029 startx = _mm_cvtss_si32(xcoords);
5030 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5033 if (startx < 0) startx = 0;
// adds (minx - startx) rounded down to a multiple of the span length
// (assuming DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two)
5034 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5036 if (endx > maxx) endx = maxx;
5037 if (startx >= endx) continue;
// cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5038 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5040 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5041 span->triangle = thread->numtriangles;
// startx/endx are relative to offset and limited to the scissor's left edge and the row end
5044 span->startx = max(minx - offset, 0);
5045 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5046 if (span->startx >= span->endx)
// span queue full: process everything queued so far
5048 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5049 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process pending spans and start over at slot 0
5054 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5056 DPSOFTRAST_Draw_ProcessSpans(thread);
5057 thread->numtriangles = 0;
// done with the command's data: drop this thread's reference and free the
// separately allocated vertex data if this was the last one
5061 if (!ATOMIC_DECREMENT(command->refcount))
5063 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5064 MM_FREE(command->arrays);
// process whatever is still queued
5067 if (thread->numspans > 0 || thread->numtriangles > 0)
5069 DPSOFTRAST_Draw_ProcessSpans(thread);
5070 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data and
// element list, points dpsoftrast.screencoord4f / dpsoftrast.post_array4f[]
// into that storage, and copies the element indices into it.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles (three indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists
// NOTE(review): the statements choosing between element3s and element3i, and
// the return statement, are not among the lines shown here.
5075 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5079 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] blocks per vertex are always needed: screen coordinates and positions
5080 int datasize = 2*numvertices*sizeof(float[4]);
5081 DPSOFTRAST_Command_Draw *command;
5082 unsigned char *data;
// plus one float[4] block per vertex for every array the active shader mode lists
5083 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5085 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
// an out-of-range entry marks the end of the list
5086 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5088 datasize += numvertices*sizeof(float[4]);
// plus the element list, 16-bit or 32-bit
5091 datasize += numtriangles*sizeof(unsigned short[3]);
5093 datasize += numtriangles*sizeof(int[3]);
5094 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for a single command: allocate only the header in the command
// pool and put the data in its own zeroed allocation
5095 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5097 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5098 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data sits directly behind the header inside the command pool
5102 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5103 data = (unsigned char *)command + commandsize;
5105 command->firstvertex = firstvertex;
5106 command->numvertices = numvertices;
5107 command->numtriangles = numtriangles;
5108 command->arrays = (float *)data;
// reset every output array pointer, then hand out the storage block by block:
// screen coordinates first, positions second, remaining shader arrays after that
5109 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5110 dpsoftrast.firstvertex = firstvertex;
5111 dpsoftrast.numvertices = numvertices;
5112 dpsoftrast.screencoord4f = (float *)data;
5113 data += numvertices*sizeof(float[4]);
5114 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5115 data += numvertices*sizeof(float[4]);
5116 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5118 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5119 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5121 dpsoftrast.post_array4f[j] = (float *)data;
5122 data += numvertices*sizeof(float[4]);
// the element list goes last; only one of the two pointers ends up non-NULL
5124 command->element3i = NULL;
5125 command->element3s = NULL;
5128 command->element3s = (unsigned short *)data;
5129 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5133 command->element3i = (int *)data;
5134 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the active shader
// mode's vertex stage, and either hands the command to the worker threads or
// flushes it immediately.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles
//   element3i / element3s    - 32-bit / 16-bit index lists
5139 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5141 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5142 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// y range of the draw, limited to the framebuffer height
// NOTE(review): drawstarty/drawendy are presumably produced by the vertex
// stage called just above - confirm
5143 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5144 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty y range: release separately allocated vertex data (header-only
// command) and take the command back out of the pool
5145 if (command->starty >= command->endy)
5147 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5148 MM_FREE(command->arrays);
5149 DPSOFTRAST_UndoCommand(command->commandsize);
5152 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread drops its own in DPSOFTRAST_Interpret_Draw
5153 command->refcount = dpsoftrast.numthreads;
5155 if (dpsoftrast.usethreads)
5158 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads that have a band overlapping the draw's y range
5159 for (i = 0; i < dpsoftrast.numthreads; i++)
5161 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5162 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5163 Thread_CondSignal(thread->drawcond);
// NOTE(review): presumably the non-threaded branch - the line selecting it is
// not among the lines shown here
5168 DPSOFTRAST_Draw_FlushThreads();
// Runs the commands in the shared command pool on behalf of one thread,
// starting at thread->commandoffset and stopping when endoffset is reached.
// The thread's commandoffset is updated after every Draw command and again
// when the loop finishes.
5172 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5174 int commandoffset = thread->commandoffset;
5175 while (commandoffset != endoffset)
5177 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5178 switch (command->opcode)
// fixed-size commands: call the matching DPSOFTRAST_Interpret_<name>, advance
// by the aligned struct size and wrap to 0 at the end of the pool
5180 #define INTERPCOMMAND(name) \
5181 case DPSOFTRAST_OPCODE_##name : \
5182 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5183 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5184 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5185 commandoffset = 0; \
5187 INTERPCOMMAND(Viewport)
5188 INTERPCOMMAND(ClearColor)
5189 INTERPCOMMAND(ClearDepth)
5190 INTERPCOMMAND(ColorMask)
5191 INTERPCOMMAND(DepthTest)
5192 INTERPCOMMAND(ScissorTest)
5193 INTERPCOMMAND(Scissor)
5194 INTERPCOMMAND(BlendFunc)
5195 INTERPCOMMAND(BlendSubtract)
5196 INTERPCOMMAND(DepthMask)
5197 INTERPCOMMAND(DepthFunc)
5198 INTERPCOMMAND(DepthRange)
5199 INTERPCOMMAND(PolygonOffset)
5200 INTERPCOMMAND(CullFace)
5201 INTERPCOMMAND(AlphaTest)
5202 INTERPCOMMAND(AlphaFunc)
5203 INTERPCOMMAND(SetTexture)
5204 INTERPCOMMAND(SetShader)
5205 INTERPCOMMAND(Uniform4f)
5206 INTERPCOMMAND(UniformMatrix4f)
5207 INTERPCOMMAND(Uniform1i)
// Draw commands are variable-sized, so the advance uses the size recorded in the command
5209 case DPSOFTRAST_OPCODE_Draw:
5210 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5211 commandoffset += command->commandsize;
5212 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// make progress visible right after each draw instead of only at the end
5214 thread->commandoffset = commandoffset;
// NOTE(review): the handling of Reset is not among the lines shown here
5217 case DPSOFTRAST_OPCODE_Reset:
// record where this thread stopped
5222 thread->commandoffset = commandoffset;
// Worker thread entry point. data is the thread's own DPSOFTRAST_State_Thread.
// The loop keeps running while thread->index is non-negative: it interprets
// commands up to dpsoftrast.drawcommand and then sleeps on drawcond until it
// is signalled again.
5225 static int DPSOFTRAST_Draw_Thread(void *data)
5227 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5228 while(thread->index >= 0)
// behind the producer: run everything that has been published so far
5230 if (thread->commandoffset != dpsoftrast.drawcommand)
5232 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5236 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so new work or a shutdown request is not missed
5237 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: release a flusher that is waiting on this thread
5239 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving tells producers that this thread needs a drawcond signal to continue
5240 thread->starving = true;
5241 Thread_CondWait(thread->drawcond, thread->drawmutex);
5242 thread->starving = false;
5244 Thread_UnlockMutex(thread->drawmutex);
// Blocks until every thread has consumed all published commands, then marks
// the command pool as empty.
// With worker threads: first wakes every starving thread that is still
// behind, then waits on each thread's waitcond until its commandoffset has
// reached dpsoftrast.drawcommand. Without worker threads the commands are
// interpreted right here, once per thread state.
5250 static void DPSOFTRAST_Draw_FlushThreads(void)
5252 DPSOFTRAST_State_Thread *thread;
5254 DPSOFTRAST_Draw_SyncCommands();
5255 if (dpsoftrast.usethreads)
// pass 1: kick every sleeping thread that still has commands to run
5257 for (i = 0; i < dpsoftrast.numthreads; i++)
5259 thread = &dpsoftrast.threads[i];
5260 if (thread->commandoffset != dpsoftrast.drawcommand)
5262 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before signalling
5263 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5264 Thread_CondSignal(thread->drawcond);
5265 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each thread to catch up
5268 for (i = 0; i < dpsoftrast.numthreads; i++)
5270 thread = &dpsoftrast.threads[i];
5271 if (thread->commandoffset != dpsoftrast.drawcommand)
5273 Thread_LockMutex(thread->drawmutex);
// the predicate is re-tested after every wakeup: condition variables may wake
// spuriously, and returning early would reset the command pool below while
// the worker is still reading from it
5274 while (thread->commandoffset != dpsoftrast.drawcommand)
// waiting asks the worker to signal waitcond once it has caught up
5276 thread->waiting = true;
5277 Thread_CondWait(thread->waitcond, thread->drawmutex);
5278 thread->waiting = false;
5280 Thread_UnlockMutex(thread->drawmutex);
// no worker threads: interpret the pending commands for every thread state in turn
5286 for (i = 0; i < dpsoftrast.numthreads; i++)
5288 thread = &dpsoftrast.threads[i];
5289 if (thread->commandoffset != dpsoftrast.drawcommand)
5290 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything has been consumed, so the pool can be reused from scratch
5293 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
5296 void DPSOFTRAST_Flush(void)
5298 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point; takes no arguments and returns nothing.
// NOTE(review): presumably flushes all outstanding work like
// DPSOFTRAST_Flush - confirm against the function body.
5301 void DPSOFTRAST_Finish(void)
// Initialises the global rasterizer state and the per-thread states.
//   width, height - framebuffer dimensions in pixels
//   numthreads    - requested worker thread count; threading is used only
//                   when this is positive and Thread_HasThreads() reports support
//   interlace     - when non-zero (and threading is on) every thread gets two
//                   scanline bands instead of one
//   colorpixels   - colour buffer, stored as colour target 0
//   depthpixels   - depth buffer
// NOTE(review): the declaration of u and the return statement are not among
// the lines shown here.
5306 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5316 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5317 dpsoftrast.bigendian = u.b[3];
5318 dpsoftrast.fb_width = width;
5319 dpsoftrast.fb_height = height;
5320 dpsoftrast.fb_depthpixels = depthpixels;
// only colour target 0 is bound; the remaining targets are explicitly cleared
// (previously index 1 was cleared three times and 2/3 relied on the memset)
5321 dpsoftrast.fb_colorpixels[0] = colorpixels;
5322 dpsoftrast.fb_colorpixels[1] = NULL;
5323 dpsoftrast.fb_colorpixels[2] = NULL;
5324 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5325 dpsoftrast.viewport[0] = 0;
5326 dpsoftrast.viewport[1] = 0;
5327 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5328 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5329 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no storage allocated yet
5330 dpsoftrast.texture_firstfree = 1;
5331 dpsoftrast.texture_end = 1;
5332 dpsoftrast.texture_max = 0;
5333 dpsoftrast.color[0] = 1;
5334 dpsoftrast.color[1] = 1;
5335 dpsoftrast.color[2] = 1;
5336 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support; without it
// there is exactly one thread state and no interlacing
// (assuming bound(lo, v, hi) clamps v into [lo, hi])
5337 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5338 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5339 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5340 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5341 for (i = 0; i < dpsoftrast.numthreads; i++)
5343 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5345 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in the visible lines and so
// stays 0 from MM_CALLOC unless it is set elsewhere - confirm this is intended
5346 thread->colormask[1] = 1;
5347 thread->colormask[2] = 1;
5348 thread->colormask[3] = 1;
5349 thread->blendfunc[0] = GL_ONE;
5350 thread->blendfunc[1] = GL_ZERO;
5351 thread->depthmask = true;
5352 thread->depthtest = true;
5353 thread->depthfunc = GL_LEQUAL;
5354 thread->scissortest = false;
5355 thread->alphatest = false;
5356 thread->alphafunc = GL_GREATER;
5357 thread->alphavalue = 0.5f;
// viewport and scissor both default to the whole framebuffer
5358 thread->viewport[0] = 0;
5359 thread->viewport[1] = 0;
5360 thread->viewport[2] = dpsoftrast.fb_width;
5361 thread->viewport[3] = dpsoftrast.fb_height;
5362 thread->scissor[0] = 0;
5363 thread->scissor[1] = 0;
5364 thread->scissor[2] = dpsoftrast.fb_width;
5365 thread->scissor[3] = dpsoftrast.fb_height;
5366 thread->depthrange[0] = 0;
5367 thread->depthrange[1] = 1;
5368 thread->polygonoffset[0] = 0;
5369 thread->polygonoffset[1] = 0;
// interlaced: thread i owns slice i and slice numthreads+i out of
// 2*numthreads equal slices of the framebuffer height
5371 if (dpsoftrast.interlace)
5373 thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5374 thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5375 thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5376 thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// not interlaced: one slice out of numthreads, with both bands set to it
5380 thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5381 thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// empty work queues and idle synchronisation flags
5384 thread->numspans = 0;
5385 thread->numtriangles = 0;
5386 thread->commandoffset = 0;
5387 thread->waiting = false;
5388 thread->starving = false;
// mark everything as needing validation and validate once up front
5390 thread->validate = -1;
5391 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects and the worker itself exist only in threaded mode
5393 if (dpsoftrast.usethreads)
5395 thread->waitcond = Thread_CreateCond();
5396 thread->drawcond = Thread_CreateCond();
5397 thread->drawmutex = Thread_CreateMutex();
5398 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5404 void DPSOFTRAST_Shutdown(void)
5407 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5409 DPSOFTRAST_State_Thread *thread;
5410 for (i = 0; i < dpsoftrast.numthreads; i++)
5412 thread = &dpsoftrast.threads[i];
5413 Thread_LockMutex(thread->drawmutex);
5415 Thread_CondSignal(thread->drawcond);
5416 Thread_UnlockMutex(thread->drawmutex);
5417 Thread_WaitThread(thread->thread, 0);
5418 Thread_DestroyCond(thread->waitcond);
5419 Thread_DestroyCond(thread->drawcond);
5420 Thread_DestroyMutex(thread->drawmutex);
5423 for (i = 0;i < dpsoftrast.texture_end;i++)
5424 if (dpsoftrast.texture[i].bytes)
5425 MM_FREE(dpsoftrast.texture[i].bytes);
5426 if (dpsoftrast.texture)
5427 free(dpsoftrast.texture);
5428 if (dpsoftrast.threads)
5429 MM_FREE(dpsoftrast.threads);
5430 memset(&dpsoftrast, 0, sizeof(dpsoftrast));