3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
// Make the project's qboolean type available under the name `bool` in this file.
14 typedef qboolean bool;
// Alignment in bytes handed to _mm_malloc by MM_MALLOC / MM_CALLOC.
18 #define ATOMIC_SIZE 32
// Toolchain-specific definitions of:
//   ALIGN(var)   - declare var with 16-byte alignment
//   ATOMIC(var)  - declare var with 32-byte alignment
//   MEMORY_BARRIER, ATOMIC_COUNTER, ATOMIC_INCREMENT/DECREMENT/ADD
// Apple: counters go through the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters go through the __sync_* builtins.
// ATOMIC_INCREMENT/DECREMENT yield the updated value; ATOMIC_ADD discards its result.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
// NOTE(review): the barrier is _mm_sfence(); the __sync_synchronize() alternative is left commented out below.
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) for alignment and the Interlocked* functions for counters.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment attribute, a no-op barrier, and plain
// (non-atomic) integer arithmetic for the counter macros.
// NOTE(review): the guards around ALIGN, ATOMIC and ATOMIC_ADD are not visible in this excerpt.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
// Allocate `size` bytes aligned to ATOMIC_SIZE using _mm_malloc.
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// Zero-initialised, ATOMIC_SIZE-aligned allocation of nmemb elements of
// `size` bytes each (the _mm_malloc counterpart of calloc).
// Returns NULL when the allocation fails or when nmemb*size does not fit in
// size_t. The result must be released with _mm_free (MM_FREE).
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc refuses element counts whose byte total overflows; without this
	// check the product would wrap and a too-small buffer would be returned
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}
// Releases memory obtained from the _mm_malloc based MM_MALLOC / MM_CALLOC.
85 #define MM_FREE _mm_free
// Alternative definitions that map directly onto the C library allocator.
// NOTE(review): the preprocessor condition selecting between the two sets is not visible in this excerpt.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture-coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and
// is used to size per-array tables such as attribs[] and post_array4f[].
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture array.
// NOTE(review): further members (flags, width, height, depth, sides, mipmaps, size are
// written by DPSOFTRAST_Texture_New) are not visible in this excerpt.
108 typedef struct DPSOFTRAST_Texture_s
// filter mode, assigned by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts how many times the texture is currently bound - confirm
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; a NULL pointer marks an unused slot
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded up to a multiple of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND).
// NOTE(review): ALIGN_SIZE is not defined in the visible part of the file.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every command: the opcode and the command's size as stored
// by DPSOFTRAST_AllocateCommand.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the next command
// does not fit into the space remaining at the end of the pool.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the opcode constant
// DPSOFTRAST_OPCODE_<name> and the struct type DPSOFTRAST_Command_<name>,
// which starts with the same opcode/commandsize header as DPSOFTRAST_Command.
// NOTE(review): the line that expands `fields` into the struct body is not visible in this excerpt.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer inside the pool.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - confirm against its users.
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command storage. DPSOFTRAST_AllocateCommand also reads and writes the
// members freecommand and usedcommands, whose declarations are not visible here.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle data referenced by spans through their `triangle` index.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array: [0] = change per x step, [1] = change per y step,
// [2] = base value, as consumed by DPSOFTRAST_CALCATTRIB / DPSOFTRAST_CALCATTRIB4F
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute array at a span's position:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// The attribs rows are read with aligned loads (_mm_load_ps).
// NOTE(review): the line closing the macro's brace block is not visible in this excerpt.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar counterpart of DPSOFTRAST_CALCATTRIB: evaluates one attribute array
// at a span's position into two float[4] lvalues.
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// triangle and span are pointer expressions; every macro argument is
// parenthesized so that expressions such as `spans + i` expand correctly.
// The expansion is a brace block, so arguments are evaluated several times.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// NOTE(review): not referenced in the visible part of the file; presumably the
// number of pixels handled per interpolation step - confirm.
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced for a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of the per-thread `validate` field. A set bit means the matching group
// of derived values is stale and is recomputed by DPSOFTRAST_Validate.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three groups together.
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes stored in thread->fb_blendmode. DPSOFTRAST_RecalcBlendFunc maps
// the source/destination blend factors (and the subtract flag) onto one of these.
202 typedef enum DPSOFTRAST_BLENDMODE_e
204 DPSOFTRAST_BLENDMODE_OPAQUE,
205 DPSOFTRAST_BLENDMODE_ALPHA,
206 DPSOFTRAST_BLENDMODE_ADDALPHA,
207 DPSOFTRAST_BLENDMODE_ADD,
208 DPSOFTRAST_BLENDMODE_INVMOD,
209 DPSOFTRAST_BLENDMODE_MUL,
210 DPSOFTRAST_BLENDMODE_MUL2,
211 DPSOFTRAST_BLENDMODE_SUBALPHA,
212 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213 DPSOFTRAST_BLENDMODE_INVADD,
214 DPSOFTRAST_BLENDMODE_TOTAL
216 DPSOFTRAST_BLENDMODE;
// State owned by one rasterizer thread. The DPSOFTRAST_Interpret_* functions
// write the render state here; the fb_* members are derived from it on demand
// by DPSOFTRAST_Validate.
// NOTE(review): many members used elsewhere in the file (index, validate, viewport,
// scissor, depthtest, drawmutex, ...) are not visible in this excerpt.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
237 float polygonoffset[2];
238 ALIGN(float clipplane[4]);
241 int shader_permutation;
242 int shader_exactspecularmath;
// textures bound per unit; the pointers are rebased by DPSOFTRAST_Texture_Grow
244 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
246 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
247 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
249 // DPSOFTRAST_VALIDATE_ flags
252 // derived values (DPSOFTRAST_VALIDATE_FB)
255 ALIGN(float fb_viewportcenter[4]);
256 ALIGN(float fb_viewportscale[4]);
258 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
261 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; compared against
// commandpool.freecommand and dpsoftrast.drawcommand by DPSOFTRAST_Draw_FreeCommandPool
270 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool
272 volatile bool waiting;
273 volatile bool starving;
280 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
281 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
283 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is `dpsoftrast` below.
// NOTE(review): several members used elsewhere in the file (fb_width, fb_height,
// viewport, interlace, numthreads, usethreads, texture_max, texture_end, ...)
// are not visible in this excerpt.
285 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers
289 unsigned int *fb_depthpixels;
290 unsigned int *fb_colorpixels[4];
// viewport transform as computed by DPSOFTRAST_RecalcViewport
293 ALIGN(float fb_viewportcenter[4]);
294 ALIGN(float fb_viewportscale[4]);
297 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
298 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// caller-provided vertex array pointers
300 const float *pointer_vertex3f;
301 const float *pointer_color4f;
302 const unsigned char *pointer_color4ub;
303 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
312 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
313 float *screencoord4f;
319 int shader_permutation;
320 int shader_exactspecularmath;
// texture table: index where DPSOFTRAST_Texture_New starts its search for a
// free slot, and the (reallocatable) array itself
324 int texture_firstfree;
325 DPSOFTRAST_Texture *texture;
// static message describing the most recent error
330 const char *errorstring;
// per-thread state, numthreads entries
335 DPSOFTRAST_State_Thread *threads;
// command pool offset published to the threads by DPSOFTRAST_Draw_SyncCommands
337 ATOMIC(volatile int drawcommand);
339 DPSOFTRAST_State_Command_Pool commandpool;
343 DPSOFTRAST_State dpsoftrast;
// Scale applied to (1 - depth) when converting to the integer depth buffer format (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 pixel: red in bits
// 16-23, green in bits 8-15, blue in bits 0-7, alpha in bits 24-31, each
// rounded to the nearest of 256 levels. The result has type int.
// Arguments are parenthesized so that expressions can be passed, and the
// shifts are done on unsigned values because shifting 255 into the sign bit
// of an int is undefined behaviour.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth format: DEPTHSCALE * (1 - d), truncated.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
351 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
353 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
354 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
355 fb_viewportcenter[3] = 0.5f;
356 fb_viewportcenter[0] = 0.0f;
357 fb_viewportscale[1] = 0.5f * viewport[2];
358 fb_viewportscale[2] = -0.5f * viewport[3];
359 fb_viewportscale[3] = 0.5f;
360 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row ranges miny1..maxy1 and miny2..maxy2 assigned
// to a thread, based on thread->index, dpsoftrast.numthreads and fb_height.
363 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is cut into 2*numthreads slices; the thread gets
// slice `index` and slice `numthreads + index`
365 if (dpsoftrast.interlace)
367 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise: both ranges are the same single slice out of numthreads
// NOTE(review): the `else` joining the two cases is not visible in this excerpt
374 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for a thread:
// fb_scissor, the viewport transform and the thread's row ranges.
379 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
381 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
382 // and viewport projection values
// scissor rectangle as x1..x2 / y1..y2, with y measured from fb_height (vertical flip)
385 x1 = thread->scissor[0];
386 x2 = thread->scissor[0] + thread->scissor[2];
387 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
388 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the whole framebuffer is used
389 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp to the framebuffer
// NOTE(review): only the upper-bound clamps are visible in this excerpt
391 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
393 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as {x, y, width, height}
394 thread->fb_scissor[0] = x1;
395 thread->fb_scissor[1] = y1;
396 thread->fb_scissor[2] = x2 - x1;
397 thread->fb_scissor[3] = y2 - y1;
399 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
400 DPSOFTRAST_RecalcThread(thread);
403 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
405 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Derives thread->fb_blendmode from the blend factors. The (source, destination)
// pair is packed into one integer, (source << 16) | destination, and matched
// against the supported combinations; anything else falls back to OPAQUE.
408 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only SRC_ALPHA/ONE is recognised
410 if (thread->blendsubtract)
412 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
414 #define BLENDFUNC(sfactor, dfactor, blendmode) \
415 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
416 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
417 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
// NOTE(review): the `else` joining the two switches is not visible in this excerpt
422 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
424 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
426 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
427 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
428 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs map to MUL
429 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
431 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
432 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
433 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is currently set in thread->validate; the expression always evaluates to 0.
// `thread` is a pointer expression and is evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived value groups that are both requested in `mask` and
// marked stale in thread->validate, clearing each stale bit before recomputing.
441 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
443 mask &= thread->validate;
446 if (mask & DPSOFTRAST_VALIDATE_FB)
448 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
449 DPSOFTRAST_RecalcFB(thread);
451 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
453 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
454 DPSOFTRAST_RecalcDepthFunc(thread);
456 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
458 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
459 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. The entry is returned only when
// 1 <= index < dpsoftrast.texture_end and the slot has pixel storage.
// NOTE(review): the result for the failing case is not visible in this excerpt;
// callers test the result with `!texture`.
463 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
465 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
466 return &dpsoftrast.texture[index];
// Enlarges the texture array and rebases every bound-texture pointer (global
// and per-thread) so that it refers to the same index in the new array.
// NOTE(review): some lines of this function are not visible in this excerpt.
470 static void DPSOFTRAST_Texture_Grow(void)
// remembered so the old pointers can be converted to indices after the move
472 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
473 DPSOFTRAST_State_Thread *thread;
477 // expand texture array as needed
478 if (dpsoftrast.texture_max < 1024)
479 dpsoftrast.texture_max = 1024;
// NOTE(review): presumably the `else` branch of the check above - doubles the capacity
481 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result is stored without a NULL check; on failure
// the old array pointer is lost and the rebasing below uses a NULL base
482 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
483 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
484 if (dpsoftrast.texbound[i])
485 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings
486 for (j = 0; j < dpsoftrast.numthreads; j++)
488 thread = &dpsoftrast.threads[j];
489 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
490 if (thread->texbound[i])
491 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the request, claims a slot in the texture
// array, fills in the mip level table and allocates zeroed pixel storage.
// On a rejected request dpsoftrast.errorstring is set to a static message.
// NOTE(review): many lines of this function (local declarations, returns, the
// mip level loop header) are not visible in this excerpt.
495 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// a cubemap has six sides, everything else one
504 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
505 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
506 DPSOFTRAST_Texture *texture;
// NOTE(review): the product test does not catch two negative dimensions and can overflow
507 if (width*height*depth < 1)
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
512 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
519 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
521 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
523 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
524 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): the condition guarding this second assignment is not visible in this excerpt
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
534 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
541 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
546 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this check repeats the previous one verbatim
551 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
556 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
561 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
566 // find first empty slot in texture array
567 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
568 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
570 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array when the chosen index is beyond its capacity
571 if (dpsoftrast.texture_max <= texnum)
572 DPSOFTRAST_Texture_Grow();
573 if (dpsoftrast.texture_end <= texnum)
574 dpsoftrast.texture_end = texnum + 1;
575 texture = &dpsoftrast.texture[texnum];
576 memset(texture, 0, sizeof(*texture));
577 texture->flags = flags;
578 texture->width = width;
579 texture->height = height;
580 texture->depth = depth;
581 texture->sides = sides;
// one mip table entry per level: {byte offset, byte size, width, height, depth};
// a level occupies w*h*d*sides pixels of 4 bytes each
593 s = w * h * d * sides * 4;
594 texture->mipmap[mipmaps][0] = size;
595 texture->mipmap[mipmaps][1] = s;
596 texture->mipmap[mipmaps][2] = w;
597 texture->mipmap[mipmaps][3] = h;
598 texture->mipmap[mipmaps][4] = d;
// the chain ends at a 1x1x1 level, or after level 0 when mipmaps were not requested
601 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
607 texture->mipmaps = mipmaps;
608 texture->size = size;
610 // allocate the pixels now
// NOTE(review): no NULL check on the allocation is visible in this excerpt
611 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and returns its slot to the free range.
// Does nothing when index does not name a live texture.
// NOTE(review): some lines of this function are not visible in this excerpt.
615 void DPSOFTRAST_Texture_Free(int index)
617 DPSOFTRAST_Texture *texture;
618 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
622 MM_FREE(texture->bytes);
623 texture->bytes = NULL;
// clears the whole entry (which also covers the bytes pointer set just above)
624 memset(texture, 0, sizeof(*texture));
625 // adjust the free range and used range
626 if (dpsoftrast.texture_firstfree > index)
627 dpsoftrast.texture_firstfree = index;
// drop trailing unused slots from the used range
628 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
629 dpsoftrast.texture_end--;
// Rebuilds mip levels 1..mipmaps-1 of a texture, each from the level above it,
// by averaging neighbouring 4-byte pixels with rounding.
// Does nothing for an invalid index.
// NOTE(review): several lines (the early return, the layer0/layer1/row0/row1
// computations and the 2D/3D branch condition) are not visible in this excerpt.
631 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
633 int i, x, y, z, w, layer0, layer1, row0, row1;
634 unsigned char *o, *i0, *i1, *i2, *i3;
635 DPSOFTRAST_Texture *texture;
636 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to derive when there is only the base level
637 if (texture->mipmaps <= 1)
639 for (i = 1;i < texture->mipmaps;i++)
641 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
645 if (layer1 >= texture->mipmap[i-1][4])
646 layer1 = texture->mipmap[i-1][4]-1;
647 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
651 if (row1 >= texture->mipmap[i-1][3])
652 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the four source rows in level i-1
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
653 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
654 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
655 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
656 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
657 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
658 w = texture->mipmap[i][2];
661 if (texture->mipmap[i-1][2] > 1)
663 // average 3D texture
// eight source pixels per output pixel
664 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
666 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
667 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
668 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
669 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
674 // average 3D mipmap with parent width == 1
// NOTE(review): i2 and i3 are read here but not advanced by the loop increment
675 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
677 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
678 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
679 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
680 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
686 if (texture->mipmap[i-1][2] > 1)
688 // average 2D texture (common case)
// four source pixels per output pixel
689 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
691 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
692 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
693 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
694 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
699 // 2D texture with parent width == 1
// two source pixels (one per row) for the single output pixel
700 o[0] = (i0[0] + i1[0] + 1) >> 1;
701 o[1] = (i0[1] + i1[1] + 1) >> 1;
702 o[2] = (i0[2] + i1[2] + 1) >> 1;
703 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangular block of 4-byte pixels into a texture and then
// regenerates its mipmaps. `pixels` holds blockheight rows of blockwidth
// pixels, tightly packed. Does nothing for an invalid index.
// NOTE(review): the visible lines address level 0 only and do not use `mip`;
// some lines of this function are not visible in this excerpt.
710 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
712 DPSOFTRAST_Texture *texture;
714 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel: row blocky, column blockx of level 0
719 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
720 while (blockheight > 0)
722 memcpy(dst, pixels, blockwidth * 4);
// the source advances by one block row, the destination by one full texture row
723 pixels += blockwidth * 4;
724 dst += texture->mipmap[0][2] * 4;
728 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of a texture with `pixels` (mipmap[0][1]
// bytes are copied) and then regenerates its mipmaps.
// Does nothing for an invalid index.
// NOTE(review): some lines of this function are not visible in this excerpt.
730 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
737 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
738 DPSOFTRAST_Texture_CalculateMipmaps(index);
740 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
742 DPSOFTRAST_Texture *texture;
743 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
744 return texture->mipmap[mip][2];
746 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
748 DPSOFTRAST_Texture *texture;
749 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
750 return texture->mipmap[mip][3];
752 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
754 DPSOFTRAST_Texture *texture;
755 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
756 return texture->mipmap[mip][4];
// Returns a pointer to the first pixel of mip level `mip` of a texture, or 0
// when index does not name a live texture.
// NOTE(review): `mip` is used as an array index without a range check; some
// lines of this function are not visible in this excerpt.
758 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
760 DPSOFTRAST_Texture *texture;
761 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the level's byte offset inside the pixel storage
764 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Does nothing for an invalid index.
// A filter value above DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created
// without the MIPMAP flag is reported through dpsoftrast.errorstring.
// NOTE(review): the lines between the error report and the assignment are not
// visible in this excerpt.
766 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
768 DPSOFTRAST_Texture *texture;
769 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
770 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
772 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
777 texture->filter = filter;
// Forward declaration; used by DPSOFTRAST_AllocateCommand.
780 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the queued commands: copies the pool's write position
// (commandpool.freecommand) to dpsoftrast.drawcommand, issuing MEMORY_BARRIER
// first when threads are in use.
782 static void DPSOFTRAST_Draw_SyncCommands(void)
784 if(dpsoftrast.usethreads) MEMORY_BARRIER;
785 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool by waiting for
// threads to get through the commands already queued.
// NOTE(review): several lines (local declarations, loop/return statements and
// the waitindex bookkeeping) are not visible in this excerpt.
788 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
790 DPSOFTRAST_State_Thread *thread;
792 int freecommand = dpsoftrast.commandpool.freecommand;
793 int usedcommands = dpsoftrast.commandpool.usedcommands;
// enough room according to the current accounting
794 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// make everything queued so far visible to the threads
796 DPSOFTRAST_Draw_SyncCommands();
// for each thread: distance from its position to the write position,
// corrected for wrap-around in the pool
802 for (i = 0; i < dpsoftrast.numthreads; i++)
804 thread = &dpsoftrast.threads[i];
805 commandoffset = freecommand - thread->commandoffset;
806 if (commandoffset < 0)
807 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
808 if (commandoffset > usedcommands)
811 usedcommands = commandoffset;
814 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread until it signals waitcond, unless it has
// already reached the published draw position
816 thread = &dpsoftrast.threads[waitindex];
817 Thread_LockMutex(thread->drawmutex);
818 if (thread->commandoffset != dpsoftrast.drawcommand)
820 thread->waiting = true;
// a thread flagged as starving is signalled through drawcond first
821 if (thread->starving) Thread_CondSignal(thread->drawcond);
822 Thread_CondWait(thread->waitcond, thread->drawmutex);
823 thread->waiting = false;
825 Thread_UnlockMutex(thread->drawmutex);
827 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (the mask arithmetic
// relies on COMMAND_SIZE being a power of two).
830 #define DPSOFTRAST_ALIGNCOMMAND(size) \
831 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> in the pool with its
// opcode and aligned size filled in, and yields a typed pointer to it.
832 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
833 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes for a command in the pool and fills in its header
// (opcode, commandsize). When the command does not fit into the space left
// at the end of the pool, a Reset opcode is written at the current position.
// NOTE(review): several lines (the `else` before the flush, the updates of
// freecommand, the wrap-around assignment and the return) are not visible in
// this excerpt.
835 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
837 DPSOFTRAST_Command *command;
838 int freecommand = dpsoftrast.commandpool.freecommand;
839 int usedcommands = dpsoftrast.commandpool.usedcommands;
// extra space to require on top of size: one command header, plus the unusable
// tail of the pool when the command does not fit there
840 int extra = sizeof(DPSOFTRAST_Command);
841 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
842 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: wait for threads or flush, then re-read the pool state
843 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
845 if (dpsoftrast.usethreads)
846 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
848 DPSOFTRAST_Draw_FlushThreads();
849 freecommand = dpsoftrast.commandpool.freecommand;
850 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: mark the tail with a Reset
// opcode and count the skipped bytes as used
852 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
854 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855 command->opcode = DPSOFTRAST_OPCODE_Reset;
856 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
859 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
860 command->opcode = opcode;
861 command->commandsize = size;
863 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated `size` bytes of command space by
// moving the pool's write position and used count back.
// NOTE(review): the lines that subtract size from freecommand and test for
// wrap-around are not visible in this excerpt.
870 static void DPSOFTRAST_UndoCommand(int size)
872 int freecommand = dpsoftrast.commandpool.freecommand;
873 int usedcommands = dpsoftrast.commandpool.usedcommands;
876 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
877 usedcommands -= size;
878 dpsoftrast.commandpool.freecommand = freecommand;
879 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command: carries the viewport rectangle.
882 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle and marks the framebuffer-derived values stale.
883 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
885 thread->viewport[0] = command->x;
886 thread->viewport[1] = command->y;
887 thread->viewport[2] = command->width;
888 thread->viewport[3] = command->height;
889 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and updates the global copy of the viewport and
// its transform (dpsoftrast.fb_viewportcenter / fb_viewportscale) immediately.
// NOTE(review): the assignments of command->x and command->y are not visible in this excerpt.
891 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
893 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
896 command->width = width;
897 command->height = height;
899 dpsoftrast.viewport[0] = x;
900 dpsoftrast.viewport[1] = y;
901 dpsoftrast.viewport[2] = width;
902 dpsoftrast.viewport[3] = height;
903 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: carries the clear color as four floats.
906 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills the part of the scissor rectangle that falls into this
// thread's row ranges with the packed BGRA8 color, in every color buffer that exists.
// NOTE(review): several lines (local declarations, early exits, the innermost
// store) are not visible in this excerpt.
907 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
909 int i, x1, y1, x2, y2, w, h, x, y;
910 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are up to date
914 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
915 miny1 = thread->miny1;
916 maxy1 = thread->maxy1;
917 miny2 = thread->miny2;
918 maxy2 = thread->maxy2;
919 x1 = thread->fb_scissor[0];
920 y1 = thread->fb_scissor[1];
921 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
922 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle to the rows between the start of the first range and the end of the second
923 if (y1 < miny1) y1 = miny1;
924 if (y2 > maxy2) y2 = maxy2;
929 // FIXME: honor fb_colormask?
930 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
931 for (i = 0;i < 4;i++)
933 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, then y jumps to miny2 for the second range
935 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
938 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
939 for (x = x1;x < x2;x++)
// Queues a ClearColor command.
// NOTE(review): the assignments of the command fields are not visible in this excerpt.
944 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
946 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: carries the clear depth as a float.
953 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills the part of the scissor rectangle that falls into this
// thread's row ranges of the depth buffer with the converted depth value.
// NOTE(review): several lines (local declarations, early exits, the innermost
// store) are not visible in this excerpt.
954 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
956 int x1, y1, x2, y2, w, h, x, y;
957 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row ranges are up to date
961 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
962 miny1 = thread->miny1;
963 maxy1 = thread->maxy1;
964 miny2 = thread->miny2;
965 maxy2 = thread->maxy2;
966 x1 = thread->fb_scissor[0];
967 y1 = thread->fb_scissor[1];
968 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
969 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle to the rows between the start of the first range and the end of the second
970 if (y1 < miny1) y1 = miny1;
971 if (y2 > maxy2) y2 = maxy2;
976 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, then y jumps to miny2 for the second range
977 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
980 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
981 for (x = x1;x < x2;x++)
// Queues a ClearDepth command.
// NOTE(review): the assignment of the command's depth field is not visible in this excerpt.
985 void DPSOFTRAST_ClearDepth(float d)
987 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command: one enable flag per channel.
991 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: stores each flag normalised to 0/1 and builds the matching
// BGRA8 bit mask (red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000).
992 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
994 thread->colormask[0] = command->r != 0;
995 thread->colormask[1] = command->g != 0;
996 thread->colormask[2] = command->b != 0;
997 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all bits set, which selects the channel's byte
998 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command.
// NOTE(review): the assignments of the command fields are not visible in this excerpt.
1000 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1002 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command: enable flag for depth testing.
1009 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function stale.
1010 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1012 thread->depthtest = command->enable;
1013 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying `enable`.
1015 void DPSOFTRAST_DepthTest(int enable)
1017 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1018 command->enable = enable;
// ScissorTest command: enable flag for the scissor test.
1021 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived values stale.
1022 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1024 thread->scissortest = command->enable;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying `enable`.
1027 void DPSOFTRAST_ScissorTest(int enable)
1029 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1030 command->enable = enable;
// Scissor command: carries the scissor rectangle as floats.
1033 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle and marks the framebuffer-derived values stale.
1034 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1036 thread->scissor[0] = command->x;
1037 thread->scissor[1] = command->y;
1038 thread->scissor[2] = command->width;
1039 thread->scissor[3] = command->height;
1040 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command.
// NOTE(review): the assignments of command->x and command->y are not visible in this excerpt.
1042 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1044 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1047 command->width = width;
1048 command->height = height;
// BlendFunc command: source and destination blend factors.
1051 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores both factors and marks the derived blend mode stale.
1052 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1054 thread->blendfunc[0] = command->sfactor;
1055 thread->blendfunc[1] = command->dfactor;
1056 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command carrying the two blend factors.
1058 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1060 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1061 command->sfactor = sfactor;
1062 command->dfactor = dfactor;
// BlendSubtract command: enable flag for subtractive blending.
1065 DEFCOMMAND(9, BlendSubtract, int enable;)
// Thread side: stores the flag and marks the derived blend mode stale.
1066 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1068 thread->blendsubtract = command->enable;
1069 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying `enable`.
1071 void DPSOFTRAST_BlendSubtract(int enable)
1073 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1074 command->enable = enable;
// DepthMask command: enable flag stored in thread->depthmask.
1077 DEFCOMMAND(10, DepthMask, int enable;)
// Thread side: stores the flag; no derived values depend on it.
1078 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1080 thread->depthmask = command->enable;
// Queues a DepthMask command carrying `enable`.
1082 void DPSOFTRAST_DepthMask(int enable)
1084 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1085 command->enable = enable;
1088 DEFCOMMAND(11, DepthFunc, int func;)
1089 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1091 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records the requested
// function code in it.
1093 void DPSOFTRAST_DepthFunc(int func)
1095 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1096 command->func = func;
// DepthRange command (id 12) carrying the near and far depth values.
1099 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a DepthRange command: stores near/far in thread->depthrange[0..1].
1100 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1102 thread->depthrange[0] = command->nearval;
1103 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records the near and
// far values in it.
1105 void DPSOFTRAST_DepthRange(float nearval, float farval)
1107 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1108 command->nearval = nearval;
1109 command->farval = farval;
// PolygonOffset command (id 13) carrying the two offset terms.
1112 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a PolygonOffset command: stores alongnormal and intoview in
// thread->polygonoffset[0] and [1] respectively.
1113 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1115 thread->polygonoffset[0] = command->alongnormal;
1116 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both
// offset terms in it.
1118 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1120 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1121 command->alongnormal = alongnormal;
1122 command->intoview = intoview;
// CullFace command (id 14) carrying the culling mode.
1125 DEFCOMMAND(14, CullFace, int mode;)
// Applies a CullFace command: stores the mode in thread->cullface.
1126 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1128 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the requested
// mode in it.
1130 void DPSOFTRAST_CullFace(int mode)
1132 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1133 command->mode = mode;
// AlphaTest command (id 15) carrying a single enable flag.
1136 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies an AlphaTest command: stores the flag in thread->alphatest.
1137 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1139 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records the enable
// flag in it.
1141 void DPSOFTRAST_AlphaTest(int enable)
1143 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1144 command->enable = enable;
// AlphaFunc command (id 16) carrying the comparison function and reference value.
1147 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies an AlphaFunc command: stores func in thread->alphafunc and ref in
// thread->alphavalue.
1148 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1150 thread->alphafunc = command->func;
1151 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records the function
// code in it.
// NOTE(review): the store of `ref` into the command is not visible in this
// listing; presumably it is on an elided line -- confirm.
1153 void DPSOFTRAST_AlphaFunc(int func, float ref)
1155 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1156 command->func = func;
// Sets the current constant color (r, g, b, a) directly in the global
// dpsoftrast state; no command is allocated here. DPSOFTRAST_Array_Load uses
// dpsoftrast.color as the fill value when no color pointer is set.
1160 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1162 dpsoftrast.color[0] = r;
1163 dpsoftrast.color[1] = g;
1164 dpsoftrast.color[2] = b;
1165 dpsoftrast.color[3] = a;
// Reads a rectangular block of 4-byte pixels from color buffer 0 into outpixels.
//   blockx, blocky           - origin of the requested block
//   blockwidth, blockheight  - size of the requested block
//   outpixels                - destination, addressed with a row stride of blockwidth*4 bytes
// The block is clamped to the framebuffer bounds before copying. Source rows
// are addressed as (fb_height - 1 - y), i.e. the framebuffer is read
// vertically flipped relative to the output row order.
// NOTE(review): the per-pixel copy statements and the bx1/by1 declarations are
// not visible in this listing.
1168 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// output stride uses the requested (unclamped) width; input stride is the framebuffer row size
1170 int outstride = blockwidth * 4;
1171 int instride = dpsoftrast.fb_width * 4;
1174 int bx2 = blockx + blockwidth;
1175 int by2 = blocky + blockheight;
1179 unsigned char *inpixels;
// clamp the block to the framebuffer rectangle
1183 if (bx1 < 0) bx1 = 0;
1184 if (by1 < 0) by1 = 0;
1185 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1186 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1188 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a per-pixel loop (body elided in this listing)
1189 if (dpsoftrast.bigendian)
1191 for (y = by1;y < by2;y++)
1193 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1194 o = (unsigned char *)outpixels + (y - by1) * outstride;
1195 for (x = bx1;x < bx2;x++)
// second row loop, same row addressing (presumably the non-bigendian path -- confirm)
1208 for (y = by1;y < by2;y++)
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 into one mip level of
// a texture.
//   index         - texture handle (looked up with DPSOFTRAST_Texture_GetByIndex)
//   mip           - mip level; out-of-range values make the function return early
//   tx, ty        - destination origin in the texture
//   sx, sy        - source origin in the framebuffer
//   width, height - requested rectangle size
// Both rectangles are clamped to their surfaces and the smaller resulting size
// is used. Source rows are walked downward from (sheight - 1 - sy1), so the
// copy flips vertically. If the texture has more than one mipmap, the chain is
// recalculated afterwards.
// NOTE(review): the tx1/ty1/sx1/sy1 and tw/th/sw/sh computations are on lines
// not visible in this listing.
1217 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1221 int tx2 = tx + width;
1222 int ty2 = ty + height;
1225 int sx2 = sx + width;
1226 int sy2 = sy + height;
1236 unsigned int *spixels;
1237 unsigned int *tpixels;
1238 DPSOFTRAST_Texture *texture;
// silently ignore invalid texture handles and mip levels
1239 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1240 if (mip < 0 || mip >= texture->mipmaps) return;
1242 spixels = dpsoftrast.fb_colorpixels[0];
1243 swidth = dpsoftrast.fb_width;
1244 sheight = dpsoftrast.fb_height;
// texture->mipmap[mip][0] is used as a byte offset into texture->bytes, [2]/[3] as width/height
1245 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1246 twidth = texture->mipmap[mip][2];
1247 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1248 if (tx1 < 0) tx1 = 0;
1249 if (ty1 < 0) ty1 = 0;
1250 if (tx2 > twidth) tx2 = twidth;
1251 if (ty2 > theight) ty2 = theight;
1252 if (sx1 < 0) sx1 = 0;
1253 if (sy1 < 0) sy1 = 0;
1254 if (sx2 > swidth) sx2 = swidth;
1255 if (sy2 > sheight) sy2 = sheight;
// copy only the area common to both clamped rectangles
1260 if (tw > sw) tw = sw;
1261 if (th > sh) th = sh;
// NOTE(review): the statement guarded by this check is elided; presumably an early return for an empty area
1262 if (tw < 1 || th < 1)
// flip: start from the mirrored source row and walk upward in memory as y increases
1264 sy1 = sheight - 1 - sy1;
1265 for (y = 0;y < th;y++)
1266 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1267 if (texture->mipmaps > 1)
1268 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command (id 17) carrying the texture unit number and the texture
// pointer to bind (which may be NULL, as the handler below allows for).
1271 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a SetTexture command to one thread's state: if the unit currently
// has a texture bound, atomically decrements that texture's binds counter,
// then stores the new texture pointer in thread->texbound[unitnum].
1272 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1274 if (thread->texbound[command->unitnum])
1275 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1276 thread->texbound[command->unitnum] = command->texture;
1278 void DPSOFTRAST_SetTexture(int unitnum, int index)
1280 DPSOFTRAST_Command_SetTexture *command;
1281 DPSOFTRAST_Texture *texture;
1282 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1284 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1287 texture = DPSOFTRAST_Texture_GetByIndex(index);
1288 if (index && !texture)
1290 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1294 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1295 command->unitnum = unitnum;
1296 command->texture = texture;
1298 dpsoftrast.texbound[unitnum] = texture;
1299 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array and its stride in the global state. The
// pointer is stored, not copied; DPSOFTRAST_Array_Load later reads through it
// using the stride as a byte offset between vertices.
1302 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1304 dpsoftrast.pointer_vertex3f = vertex3f;
1305 dpsoftrast.stride_vertex = stride;
// Records a float4 color array and its stride in the global state, and clears
// the unsigned-byte color pointer so only one color source is active.
1307 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1309 dpsoftrast.pointer_color4f = color4f;
1310 dpsoftrast.pointer_color4ub = NULL;
1311 dpsoftrast.stride_color = stride;
// Records a 4-unsigned-byte color array and its stride in the global state,
// and clears the float color pointer so only one color source is active.
1313 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1315 dpsoftrast.pointer_color4f = NULL;
1316 dpsoftrast.pointer_color4ub = color4ub;
1317 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for slot `unitnum`: the pointer, the
// number of float components per vertex, and the stride.
// NOTE(review): unitnum is used as an array index without a range check in the
// visible lines; callers presumably guarantee it is valid -- confirm.
1319 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1321 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1322 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1323 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command (id 18) carrying the shader mode, permutation and the
// exactspecularmath flag.
1326 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a SetShader command: copies all three fields into the thread's
// shader_* state.
1327 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1329 thread->shader_mode = command->mode;
1330 thread->shader_permutation = command->permutation;
1331 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point for selecting the shader: allocates a SetShader command
// with the given values and also mirrors them into the global dpsoftrast state.
1333 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1335 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1336 command->mode = mode;
1337 command->permutation = permutation;
1338 command->exactspecularmath = exactspecularmath;
// keep the global copy of the shader selection in sync with the command
1340 dpsoftrast.shader_mode = mode;
1341 dpsoftrast.shader_permutation = permutation;
1342 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command (id 19) carrying a uniform index and four float values.
1345 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a Uniform4f command: copies the four floats into the thread's
// uniform4f array at slot index*4.
1346 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1348 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a four-component float uniform from individual values: allocates a
// Uniform4f command holding (v0, v1, v2, v3) and also writes the same values
// into the global dpsoftrast.uniform4f array at slot index*4.
1350 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1352 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1353 command->index = index;
1354 command->val[0] = v0;
1355 command->val[1] = v1;
1356 command->val[2] = v2;
1357 command->val[3] = v3;
// mirror into the global uniform table
1359 dpsoftrast.uniform4f[index*4+0] = v0;
1360 dpsoftrast.uniform4f[index*4+1] = v1;
1361 dpsoftrast.uniform4f[index*4+2] = v2;
1362 dpsoftrast.uniform4f[index*4+3] = v3;
// Sets a four-component float uniform from an array: allocates a Uniform4f
// command, copies four floats from v into it, and also copies them into the
// global dpsoftrast.uniform4f array at slot index*4. v must point to at least
// four readable floats.
1364 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1366 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1367 command->index = index;
1368 memcpy(command->val, v, sizeof(command->val));
1370 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command (id 20) carrying a uniform index and a 16-float
// matrix; the matrix field is wrapped in ALIGN() so SSE aligned stores can
// target it.
1373 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at slot index*4 (four consecutive vec4 slots).
1374 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1376 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets one or more 4x4 float matrix uniforms.
//   uniform   - first uniform index
//   arraysize - number of matrices to set
//   transpose - NOTE(review): the test of this flag is on a line not visible
//               here; presumably it guards the transpose step below -- confirm
//   v         - arraysize*16 floats; may be unaligned
// Each matrix occupies four consecutive uniform indices (index advances by 4
// and v by 16 per iteration). One UniformMatrix4f command is allocated per
// matrix, and the same data is mirrored into the global dpsoftrast.uniform4f.
1378 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1382 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1384 __m128 m0, m1, m2, m3;
1385 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1386 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the source address
1387 if (((size_t)v)&(ALIGN_SIZE-1))
1389 m0 = _mm_loadu_ps(v);
1390 m1 = _mm_loadu_ps(v+4);
1391 m2 = _mm_loadu_ps(v+8);
1392 m3 = _mm_loadu_ps(v+12);
1396 m0 = _mm_load_ps(v);
1397 m1 = _mm_load_ps(v+4);
1398 m2 = _mm_load_ps(v+8);
1399 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack lo/hi followed by movelh/movehl
1403 __m128 t0, t1, t2, t3;
1404 t0 = _mm_unpacklo_ps(m0, m1);
1405 t1 = _mm_unpacklo_ps(m2, m3);
1406 t2 = _mm_unpackhi_ps(m0, m1);
1407 t3 = _mm_unpackhi_ps(m2, m3);
1408 m0 = _mm_movelh_ps(t0, t1);
1409 m1 = _mm_movehl_ps(t1, t0);
1410 m2 = _mm_movelh_ps(t2, t3);
1411 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both command->val and the uniform4f slot must be 16-byte aligned
1413 _mm_store_ps(command->val, m0);
1414 _mm_store_ps(command->val+4, m1);
1415 _mm_store_ps(command->val+8, m2);
1416 _mm_store_ps(command->val+12, m3);
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1420 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command (id 21) carrying a uniform index and one integer value.
1425 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a Uniform1i command: stores the value in thread->uniform1i[index].
1426 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1428 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command for `index` and also
// writes i0 into the global dpsoftrast.uniform1i table.
// NOTE(review): the store of i0 into command->val is not visible in this
// listing; presumably it is on an elided line -- confirm.
1430 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1432 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1433 command->index = index;
1436 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command (id 24) carrying the four plane coefficients.
1439 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Applies a ClipPlane command: copies the four coefficients into
// thread->clipplane.
1440 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1442 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
// Sets the user clip plane (x, y, z, w). Before being stored in the command,
// the coefficients are rescaled by the current viewport transform: each is
// divided by the matching fb_viewportscale entry, and w is then reduced by the
// dot product of fb_viewportcenter with the rescaled coefficients.
// The viewport arrays are indexed [1]=x, [2]=y, [3]=z, [0]=w here, which
// matches the lane order DPSOFTRAST_PROJECTVERTEX applies them in.
// NOTE(review): this presumably re-expresses the plane in screen space so it
// can be evaluated on projected coordinates -- confirm against the consumer of
// thread->clipplane.
1444 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1446 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1447 x /= dpsoftrast.fb_viewportscale[1];
1448 y /= dpsoftrast.fb_viewportscale[2];
1449 z /= dpsoftrast.fb_viewportscale[3];
1450 w /= dpsoftrast.fb_viewportscale[0];
// note: the right-hand side uses the already rescaled x, y, z and w
1451 w -= dpsoftrast.fb_viewportcenter[1]*x + dpsoftrast.fb_viewportcenter[2]*y + dpsoftrast.fb_viewportcenter[3]*z + dpsoftrast.fb_viewportcenter[0]*w;
1452 command->clipplane[0] = x;
1453 command->clipplane[1] = y;
1454 command->clipplane[2] = z;
1455 command->clipplane[3] = w;
// Copies `size` float4 elements from a strided byte source into the packed
// destination array.
//   dst    - destination; written with aligned stores, so must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// Unaligned loads are used when either the source address or the stride is not
// a multiple of ALIGN_SIZE; otherwise aligned loads are used.
// NOTE(review): the loop headers and pointer increments are on lines not
// visible in this listing.
1459 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1461 float *end = dst + size*4;
1462 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1466 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1475 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` float3 elements from a strided byte source into float4
// elements (x, y, z, 1.0) in the packed destination array.
//   dst    - destination; written with aligned stores, so must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// When the source is tightly packed (stride == 12 bytes) four elements are
// produced per step from three 16-byte loads; the remaining elements and the
// non-packed case go through the one-element-at-a-time code at the end.
// The recurring three-instruction idiom is: rotate the lanes so the unwanted
// value sits in lane 0, overwrite lane 0 with 1.0 via _mm_move_ss, then rotate
// back so the 1.0 lands in lane 3 (w).
// NOTE(review): the loop headers and dst increments are on lines not visible
// in this listing. The single-element path loads 16 bytes per float3, so it
// reads 4 bytes beyond each element -- confirm the source buffers tolerate this.
1482 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1484 float *end = dst + size*4;
1485 if (stride == sizeof(float[3]))
// end of the portion that can be processed four elements at a time
1487 float *end4 = dst + (size&~3)*4;
1488 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned packed path: v1,v2,v3 hold 12 floats = four xyz triples
1492 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1493 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1494 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1497 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1498 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1499 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1500 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1501 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1504 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506 src += 4*sizeof(float[3]);
// aligned packed path: same shuffles, aligned loads
1513 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1514 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1515 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1516 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1518 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1521 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1522 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1525 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527 src += 4*sizeof(float[3]);
// one element per step: unaligned loads if the address or the stride is misaligned
1531 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1535 __m128 v = _mm_loadu_ps((const float *)src);
1536 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1537 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1538 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1539 _mm_store_ps(dst, v);
1548 __m128 v = _mm_load_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
// Expands `size` float2 elements from a strided byte source into float4
// elements (x, y, 0.0, 1.0) in the packed destination array.
//   dst    - destination; written with aligned stores, so must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// When the source is tightly packed (stride == 8 bytes) two elements are
// produced per 16-byte load; remaining elements and the non-packed case use a
// 64-bit low-half load per element.
// NOTE(review): the loop headers and pointer increments for dst are on lines
// not visible in this listing.
1559 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 float *end = dst + size*4;
// v2 supplies the constant upper half (z = 0, w = 1) of every output
1562 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1563 if (stride == sizeof(float[2]))
// end of the portion that can be processed two elements at a time
1565 float *end2 = dst + (size&~1)*4;
1566 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1570 __m128 v = _mm_loadu_ps((const float *)src);
// first element: low half of v + upper half of v2; second element: high half of v + upper half of v2
1571 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1572 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1574 src += 2*sizeof(float[2]);
1581 __m128 v = _mm_load_ps((const float *)src);
1582 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1583 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half, keep v2's upper half
1591 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-unsigned-byte elements from a strided byte source into
// float4 elements, scaling each byte by 1/255.
//   dst    - destination; written with aligned stores, so must be 16-byte aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between source elements
// When the source is tightly packed (stride == 4 bytes) four elements are
// converted per 16-byte load; remaining elements and the non-packed case
// convert one 32-bit element at a time.
// NOTE(review): the loop headers and pointer increments for dst are on lines
// not visible in this listing.
1597 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1599 float *end = dst + size*4;
1600 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1601 if (stride == sizeof(unsigned char[4]))
// end of the portion that can be processed four elements at a time
1603 float *end4 = dst + (size&~3)*4;
1604 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes -> 16-bit (v1, v2) -> 32-bit by interleaving with zero, then convert and scale
1608 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1609 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1610 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1611 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1612 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1614 src += 4*sizeof(unsigned char[4]);
1621 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
// single element: read the four bytes as one int and widen them the same way
1633 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1634 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills the destination array with copies of a single float4 value.
//   dst  - destination; written with aligned stores, so must be 16-byte aligned
//   src  - the four floats to replicate (may be unaligned)
//   size - number of float4 elements to write
// NOTE(review): the loop header and dst increment are on lines not visible in
// this listing.
1640 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1642 float *end = dst + 4*size;
1643 __m128 v = _mm_loadu_ps(src);
1646 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vectors by a 4x4 matrix.
//   out4f       - destination array (may equal in4f)
//   in4f        - source array
//   numitems    - number of float4 vectors
//   inmatrix16f - 16 floats, read as four 4-float groups m0..m3 (may be unaligned)
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3. If the matrix compares
// bytewise equal to the identity matrix, the input is simply copied (or left
// alone when out4f == in4f).
// NOTE(review): the loop headers, the store calls wrapping the expressions
// below and the early return after the identity case are on lines not visible
// in this listing.
1652 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1655 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1656 __m128 m0, m1, m2, m3;
1658 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1660 // fast case for identity matrix
1661 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1664 end = out4f + numitems*4;
1665 m0 = _mm_loadu_ps(inmatrix16f);
1666 m1 = _mm_loadu_ps(inmatrix16f + 4);
1667 m2 = _mm_loadu_ps(inmatrix16f + 8);
1668 m3 = _mm_loadu_ps(inmatrix16f + 12);
1669 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path
1673 __m128 v = _mm_loadu_ps(in4f);
1675 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1676 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1677 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1678 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input path, same arithmetic
1687 __m128 v = _mm_load_ps(in4f);
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1692 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy; the two
// ranges must therefore not overlap.
1700 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1702 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects a clip-space vertex `in` (x, y, z, w) to screen space and writes the
// result to `out`. The lanes are rotated to (1, x, y, z), multiplied by
// viewportscale, divided by w, offset by viewportcenter, and rotated back, so
// out = (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, cw + sw/w), where the viewport
// vectors are laid out as [0]=w term, [1]=x, [2]=y, [3]=z.
// No guard against w == 0 is visible in the macro.
1706 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1708 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1709 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1710 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1711 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection helper with the same parameters as DPSOFTRAST_PROJECTVERTEX; the
// lines visible in this listing are identical to that macro's body (rotate to
// (1, x, y, z), scale, divide by w, add center, rotate back).
// NOTE(review): no use of this macro appears in this chunk; the name suggests a
// y-only variant, but the visible body computes all four lanes -- confirm
// whether it is still needed.
1714 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1716 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1717 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1718 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1719 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a float4 vertex by a 4x4 matrix given as four __m128 values:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// NOTE(review): the declaration of the local `p` is on a line not visible in
// this listing; presumably it is initialized from `in` -- confirm.
1722 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1725 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1726 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1727 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1728 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by an axis-aligned bounding box.
//   starty, endy     - outputs: first row and one-past-last row
//   minposf, maxposf - box corners; loaded with aligned loads, so must be 16-byte aligned
//   inmatrix16f      - 16-float matrix applied to the box corners (may be unaligned)
// The eight corners are transformed; corners with z + w >= 0 are projected and
// accumulated into a running min/max of y/w, and their bit is cleared from
// clipmask. For corners that remain flagged, each box edge leading to an
// unflagged neighbour (k^1, k^2, k^4) is intersected with the z + w = 0 plane
// and that intersection point is projected instead.
// NOTE(review): the return statement is not visible in this listing; callers
// store the result in dpsoftrast.drawclipped, so presumably clipmask is
// returned -- confirm.
1731 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per box corner; a set bit means "behind the z + w = 0 plane" until proven otherwise
1733 int clipmask = 0xFF;
1734 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start outside the [-2, 2] clamp applied at the end
1735 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1736 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1737 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap the first two lanes of each matrix vector so the transformed y ends up
// in lane 0, where the scalar (_ss) operations below work
1738 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1739 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1740 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1741 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its z + w distance, and if it is
// non-negative clear its clipmask bit and fold y/w into minproj/maxproj
1742 #define BBFRONT(k, pos) \
1744 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1745 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1746 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1749 clipmask &= ~(1<<k); \
1750 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1751 minproj = _mm_min_ss(minproj, proj); \
1752 maxproj = _mm_max_ss(maxproj, proj); \
1756 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1757 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1758 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1759 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1760 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1761 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1765 if (clipmask&(1<<k)) \
1767 if (!(clipmask&(1<<(k^1)))) \
1769 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1770 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1771 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1772 minproj = _mm_min_ss(minproj, proj); \
1773 maxproj = _mm_max_ss(maxproj, proj); \
1775 if (!(clipmask&(1<<(k^2)))) \
1777 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1778 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1779 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1780 minproj = _mm_min_ss(minproj, proj); \
1781 maxproj = _mm_max_ss(maxproj, proj); \
1783 if (!(clipmask&(1<<(k^4)))) \
1785 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1786 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1787 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788 minproj = _mm_min_ss(minproj, proj); \
1789 maxproj = _mm_max_ss(maxproj, proj); \
1793 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move the y entry ([2]) of the viewport vectors into lane 0 for the scalar math below
1794 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1795 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixel rows
1796 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1797 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1798 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1799 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty is taken from maxproj and endy from minproj (presumably because the
// viewport y scale is negative -- confirm); both are truncated toward zero
1800 *starty = _mm_cvttss_si32(maxproj);
1801 *endy = _mm_cvttss_si32(minproj)+1;
// Copies `numitems` float4 vertices to out4f, writes their screen-space
// projections to screen4f, and computes the covered row range.
//   out4f        - destination for the unprojected vertices (aligned stores)
//   screen4f     - destination for the projected vertices (aligned stores)
//   starty, endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; may be unaligned
//   numitems     - number of vertices
// While looping, the component-wise min and max of the input positions are
// tracked; that bounding box is then passed to DPSOFTRAST_Vertex_BoundY with an
// identity matrix, and its result is returned.
// NOTE(review): the loop headers, pointer increments and any condition guarding
// the BoundY call are on lines not visible in this listing.
1805 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1807 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1808 float *end = out4f + numitems*4;
1809 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1810 __m128 minpos, maxpos;
1811 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1813 minpos = maxpos = _mm_loadu_ps(in4f);
1816 __m128 v = _mm_loadu_ps(in4f);
1817 minpos = _mm_min_ps(minpos, v);
1818 maxpos = _mm_max_ps(maxpos, v);
1819 _mm_store_ps(out4f, v);
1820 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1821 _mm_store_ps(screen4f, v);
// aligned input path, same work
1829 minpos = maxpos = _mm_load_ps(in4f);
1832 __m128 v = _mm_load_ps(in4f);
1833 minpos = _mm_min_ps(minpos, v);
1834 maxpos = _mm_max_ps(maxpos, v);
1835 _mm_store_ps(out4f, v);
1836 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because BoundY takes float pointers
1845 ALIGN(float minposf[4]);
1846 ALIGN(float maxposf[4]);
1847 _mm_store_ps(minposf, minpos);
1848 _mm_store_ps(maxposf, maxpos);
1849 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` float4 vertices by a matrix, writes the transformed
// vertices to out4f and their screen-space projections to screen4f, and
// computes the covered row range.
//   out4f        - destination for the transformed vertices (aligned stores)
//   screen4f     - destination for the projected vertices (aligned stores)
//   starty, endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices; may be unaligned
//   numitems     - number of vertices
//   inmatrix16f  - 16-float matrix (may be unaligned)
// If the matrix compares bytewise equal to identity, the work is delegated to
// DPSOFTRAST_Vertex_Project. Otherwise the min/max of the *untransformed*
// positions is tracked and passed, together with the matrix, to
// DPSOFTRAST_Vertex_BoundY, whose result is returned.
// NOTE(review): the loop headers, pointer increments and any condition guarding
// the BoundY call are on lines not visible in this listing.
1854 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1856 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1857 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// identity matrix: skip the multiply entirely
1859 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1860 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1861 end = out4f + numitems*4;
1862 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1863 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1864 m0 = _mm_loadu_ps(inmatrix16f);
1865 m1 = _mm_loadu_ps(inmatrix16f + 4);
1866 m2 = _mm_loadu_ps(inmatrix16f + 8);
1867 m3 = _mm_loadu_ps(inmatrix16f + 12);
1868 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input path; the bounds are seeded from the first vertex
1870 minpos = maxpos = _mm_loadu_ps(in4f);
1873 __m128 v = _mm_loadu_ps(in4f);
1874 minpos = _mm_min_ps(minpos, v);
1875 maxpos = _mm_max_ps(maxpos, v);
1876 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1877 _mm_store_ps(out4f, v);
1878 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1879 _mm_store_ps(screen4f, v);
// aligned input path, same work
1887 minpos = maxpos = _mm_load_ps(in4f);
1890 __m128 v = _mm_load_ps(in4f);
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because BoundY takes float pointers
1904 ALIGN(float minposf[4]);
1905 ALIGN(float maxposf[4]);
1906 _mm_store_ps(minposf, minpos);
1907 _mm_store_ps(maxposf, maxpos);
1908 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array into dpsoftrast.post_array4f[outarray] as
// packed float4 data, covering dpsoftrast.numvertices vertices starting at
// dpsoftrast.firstvertex.
//   outarray - index of the destination post-transform array
//   inarray  - which attribute to load (position, color, or a texcoord slot)
// Positions are expanded from float3. Colors come from the float4 pointer, else
// the 4-byte pointer, else the constant dpsoftrast.color. Texcoords are
// expanded according to their recorded component count.
// NOTE(review): the switch statement, case labels for the texcoord path, the
// fallback when a texcoord pointer is NULL and the return statement are on
// lines not visible in this listing; presumably outf is returned -- confirm.
1914 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1917 float *outf = dpsoftrast.post_array4f[outarray];
1918 const unsigned char *inb;
1919 int firstvertex = dpsoftrast.firstvertex;
1920 int numvertices = dpsoftrast.numvertices;
1924 case DPSOFTRAST_ARRAY_POSITION:
1925 stride = dpsoftrast.stride_vertex;
// byte pointer arithmetic: stride is applied in bytes, not in floats
1926 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1927 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1929 case DPSOFTRAST_ARRAY_COLOR:
1930 stride = dpsoftrast.stride_color;
1931 if (dpsoftrast.pointer_color4f)
1933 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1934 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1936 else if (dpsoftrast.pointer_color4ub)
1938 stride = dpsoftrast.stride_color;
1939 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1940 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: replicate the constant color for every vertex
1944 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord slots are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1948 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1949 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1951 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1952 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1955 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1958 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an attribute array in place by a 4x4 matrix. When inarray >= 0 the
// attribute is first loaded into post_array4f[outarray]; a negative inarray
// means "use the data already in post_array4f[outarray]".
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1973 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1975 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1976 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an attribute array to screen space without a matrix multiply. When
// inarray >= 0 the attribute is first loaded into post_array4f[outarray]; a
// negative inarray reuses the existing contents. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to drawstarty/drawendy, and the
// value returned by DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1981 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1984 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1985 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an attribute array in place by a 4x4 matrix and projects it to
// screen space. When inarray >= 0 the attribute is first loaded into
// post_array4f[outarray]; a negative inarray reuses the existing contents.
// Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this listing;
// presumably `data` is returned -- confirm.
1993 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1996 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1997 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for a span, covering pixels
// [span->startx, span->endx).
//   thread   - per-thread state (not referenced in the visible lines)
//   triangle - supplies the w plane: w[0] = d/dx, w[1] = d/dy, w[2] = constant
//   span     - supplies the span origin (x, y) and the pixel range
//   zf       - output buffer
// w is evaluated at the span origin; 1/w is computed exactly at sub-span
// boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and linearly
// interpolated in between. When w does not vary along x, a simpler loop is
// used instead.
// NOTE(review): the statements that write to zf and the declarations of x, z
// and dz are on lines not visible in this listing, so the exact values stored
// cannot be confirmed from here.
2004 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2007 int startx = span->startx;
2008 int endx = span->endx;
2009 float wslope = triangle->w[0];
2010 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// reciprocal of w at the first pixel of the span
2011 float endz = 1.0f / (w + wslope * startx);
2012 if (triangle->w[0] == 0)
2014 // LordHavoc: fast flat polygons (HUD/menu)
2015 for (x = startx;x < endx;x++)
// perspective case: walk the span in sub-spans
2019 for (x = startx;x < endx;)
2021 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop exactly at the final pixel
2023 if (nextsub >= endx) nextsub = endsub = endx-1;
2024 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step between the two exact reciprocals; zero for a one-pixel sub-span
2025 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2026 for (; x <= endsub; x++, z += dz)
2031 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2034 int startx = span->startx;
2035 int endx = span->endx;
2038 unsigned char * RESTRICT pixelmask = span->pixelmask;
2039 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2042 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2043 // handle alphatest now (this affects depth writes too)
2044 if (thread->alphatest)
2045 for (x = startx;x < endx;x++)
2046 if (in4f[x*4+3] < 0.5f)
2047 pixelmask[x] = false;
2048 // FIXME: this does not handle bigendian
2049 switch(thread->fb_blendmode)
2051 case DPSOFTRAST_BLENDMODE_OPAQUE:
2052 for (x = startx;x < endx;x++)
2056 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2057 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2058 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2059 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2060 pixel[x*4+0] = d[0];
2061 pixel[x*4+1] = d[1];
2062 pixel[x*4+2] = d[2];
2063 pixel[x*4+3] = d[3];
2066 case DPSOFTRAST_BLENDMODE_ALPHA:
2067 for (x = startx;x < endx;x++)
2071 a = in4f[x*4+3] * 255.0f;
2072 b = 1.0f - in4f[x*4+3];
2073 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2074 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2075 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2076 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2077 pixel[x*4+0] = d[0];
2078 pixel[x*4+1] = d[1];
2079 pixel[x*4+2] = d[2];
2080 pixel[x*4+3] = d[3];
2083 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084 for (x = startx;x < endx;x++)
2088 a = in4f[x*4+3] * 255.0f;
2089 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2090 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2091 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2092 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2093 pixel[x*4+0] = d[0];
2094 pixel[x*4+1] = d[1];
2095 pixel[x*4+2] = d[2];
2096 pixel[x*4+3] = d[3];
2099 case DPSOFTRAST_BLENDMODE_ADD:
2100 for (x = startx;x < endx;x++)
2104 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2105 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2106 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2107 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2108 pixel[x*4+0] = d[0];
2109 pixel[x*4+1] = d[1];
2110 pixel[x*4+2] = d[2];
2111 pixel[x*4+3] = d[3];
2114 case DPSOFTRAST_BLENDMODE_INVMOD:
2115 for (x = startx;x < endx;x++)
2119 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2120 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2121 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2122 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2123 pixel[x*4+0] = d[0];
2124 pixel[x*4+1] = d[1];
2125 pixel[x*4+2] = d[2];
2126 pixel[x*4+3] = d[3];
2129 case DPSOFTRAST_BLENDMODE_MUL:
2130 for (x = startx;x < endx;x++)
2134 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2135 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2137 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138 pixel[x*4+0] = d[0];
2139 pixel[x*4+1] = d[1];
2140 pixel[x*4+2] = d[2];
2141 pixel[x*4+3] = d[3];
2144 case DPSOFTRAST_BLENDMODE_MUL2:
2145 for (x = startx;x < endx;x++)
2149 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2150 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2151 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2152 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2153 pixel[x*4+0] = d[0];
2154 pixel[x*4+1] = d[1];
2155 pixel[x*4+2] = d[2];
2156 pixel[x*4+3] = d[3];
2159 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2160 for (x = startx;x < endx;x++)
2164 a = in4f[x*4+3] * -255.0f;
2165 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2166 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2167 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2168 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2169 pixel[x*4+0] = d[0];
2170 pixel[x*4+1] = d[1];
2171 pixel[x*4+2] = d[2];
2172 pixel[x*4+3] = d[3];
2175 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2176 for (x = startx;x < endx;x++)
2181 b = 1.0f - in4f[x*4+3];
2182 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2183 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2184 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2185 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2186 pixel[x*4+0] = d[0];
2187 pixel[x*4+1] = d[1];
2188 pixel[x*4+2] = d[2];
2189 pixel[x*4+3] = d[3];
2192 case DPSOFTRAST_BLENDMODE_INVADD:
2193 for (x = startx;x < endx;x++)
2197 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2198 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2199 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2200 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2201 pixel[x*4+0] = d[0];
2202 pixel[x*4+1] = d[1];
2203 pixel[x*4+2] = d[2];
2204 pixel[x*4+3] = d[3];
// DPSOFTRAST_Draw_Span_FinishBGRA8: writes one span of 8-bit-per-channel
// color (in4ub, 4 bytes per pixel) into dpsoftrast.fb_colorpixels[0] using
// the blend mode in thread->fb_blendmode, restricted to the pixels whose
// span->pixelmask entry is nonzero.
//  thread   - supplies alphatest and fb_blendmode
//  triangle - NOTE(review): no use of this parameter is apparent; confirm
//  span     - supplies x, y, startx, endx and the pixelmask
//  in4ub    - source colors indexed by x*4; byte 3 is treated as alpha
// The pixelmask is modified: alpha-tested and fully transparent pixels are
// cleared, and two sentinel bytes are written at endx and endx+1.
2210 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2214 int startx = span->startx;
2215 int endx = span->endx;
// ini/pixeli view source and framebuffer as one 32-bit word per pixel,
// in4ub/pixel view the same memory as individual bytes
2217 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2218 unsigned char * RESTRICT pixelmask = span->pixelmask;
2219 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2220 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to pixel (span->x, span->y); x indices used
// below are relative to that position
2223 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2224 pixeli += span->y * dpsoftrast.fb_width + span->x;
2225 // handle alphatest now (this affects depth writes too)
2226 if (thread->alphatest)
2227 for (x = startx;x < endx;x++)
// alpha below 128 (roughly 0.5) fails the test
2228 if (in4ub[x*4+3] < 128)
2229 pixelmask[x] = false;
2230 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2231 // helps sprites, text and hud artwork
2232 switch(thread->fb_blendmode)
2234 case DPSOFTRAST_BLENDMODE_ALPHA:
2235 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2237 for (x = startx;x < endx;x++)
// in4ub is unsigned char, so "< 1" means alpha == 0
2238 if (in4ub[x*4+3] < 1)
2239 pixelmask[x] = false;
2241 case DPSOFTRAST_BLENDMODE_OPAQUE:
2242 case DPSOFTRAST_BLENDMODE_ADD:
2243 case DPSOFTRAST_BLENDMODE_INVMOD:
2244 case DPSOFTRAST_BLENDMODE_MUL:
2245 case DPSOFTRAST_BLENDMODE_MUL2:
2246 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2247 case DPSOFTRAST_BLENDMODE_INVADD:
2250 // put some special values at the end of the mask to ensure the loops end
// a 1 at endx stops the "skip masked-off pixels" scan and the 0 at endx+1
// stops the "measure run of set pixels" scan
// NOTE(review): these writes require the mask buffer to extend at least two
// bytes past endx - confirm against its allocation
2251 pixelmask[endx] = 1;
2252 pixelmask[endx+1] = 0;
2253 // LordHavoc: use a double loop to identify subspans, this helps the
2254 // optimized copy/blend loops to perform at their best, most triangles
2255 // have only one run of pixels, and do the search using wide reads...
2259 // if this pixel is masked off, it's probably not alone...
2266 // the 4-item search must be aligned or else it stalls badly
2267 if ((x & 3) && !pixelmask[x]) x++;
2268 if ((x & 3) && !pixelmask[x]) x++;
2269 if ((x & 3) && !pixelmask[x]) x++;
// NOTE(review): the cast binds before the addition, so this dereferences the
// x-th unsigned int (byte offset x*sizeof(unsigned int)), whereas
// pixelmask[x] above indexes bytes - confirm this is intended;
// (unsigned int *)(pixelmask + x) would match the byte index
2270 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2274 for (;!pixelmask[x];x++)
2276 // rather than continue the loop, just check the end variable
2280 // find length of subspan
2285 if ((subx & 3) && pixelmask[subx]) subx++;
2286 if ((subx & 3) && pixelmask[subx]) subx++;
2287 if ((subx & 3) && pixelmask[subx]) subx++;
// 0x01010101 == four consecutive mask bytes all equal to 1
// NOTE(review): same pointer-arithmetic question as the zero scan above
2288 while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2292 for (;pixelmask[subx];subx++)
2294 // the checks can overshoot, so make sure to clip it...
2297 // now that we know the subspan length... process!
2298 switch(thread->fb_blendmode)
// opaque: straight copy of pixels [x, subx); the lines below show a memcpy
// form and an SSE2 form (16 pixels, then 4 pixels per step, unaligned
// loads/stores) - presumably alternatives chosen at compile time
2300 case DPSOFTRAST_BLENDMODE_OPAQUE:
2304 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2309 while (x + 16 <= subx)
2311 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2312 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2313 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2314 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2319 while (x + 4 <= subx)
2321 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2327 pixeli[x+1] = ini[x+1];
// blended modes: FINISHBLEND(blend2, blend1) processes two pixels per
// iteration (64-bit load/store) and has a one-pixel form (32-bit
// load/store); src and dst bytes are widened to 16-bit lanes before the
// blend and packed back with unsigned saturation afterwards. In the blend
// expressions, _MM_SHUFFLE(3, 3, 3, 3) broadcasts lane 3 (alpha) of each
// source pixel across that pixel's four lanes.
2337 case DPSOFTRAST_BLENDMODE_ALPHA:
2338 #define FINISHBLEND(blend2, blend1) \
2339 for (;x + 1 < subx;x += 2) \
2342 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2343 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2345 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2350 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2351 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2353 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2357 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2358 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2360 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2361 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// addalpha: dst + ((src * alpha) >> 8)
2364 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2366 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2367 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2369 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2370 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// add: src + dst (saturated when packed back to bytes)
2373 case DPSOFTRAST_BLENDMODE_ADD:
2374 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// invmod: dst - ((dst * src) >> 8)
2376 case DPSOFTRAST_BLENDMODE_INVMOD:
2378 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2380 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// mul: (src * dst) >> 8
2383 case DPSOFTRAST_BLENDMODE_MUL:
2384 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// mul2: (src * dst) >> 7, i.e. twice the mul result
2386 case DPSOFTRAST_BLENDMODE_MUL2:
2387 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subalpha: dst - ((src * alpha) >> 8)
2389 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2391 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2392 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2394 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2395 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// pseudoalpha: src + dst - ((dst * alpha) >> 8)
2398 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2400 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2401 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2403 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2404 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// invadd: dst + (((255 - dst) * src) >> 8), using the same <<4 / mulhi
// arrangement as the alpha case
2407 case DPSOFTRAST_BLENDMODE_INVADD:
2409 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2411 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// DPSOFTRAST_Draw_Span_Texture2DVarying: samples the texture bound to
// texunitindex across the span and writes four floats per pixel to
// out4f[x*4 .. x*4+3] for x in [startx, endx).
//  thread       - thread->texbound[texunitindex] is the texture to sample
//  triangle     - supplies the mip level (triangle->mip[texunitindex]) and is
//                 passed to DPSOFTRAST_CALCATTRIB4F
//  span         - supplies startx and endx
//  out4f        - destination color array
//  texunitindex - texture unit to sample
//  arrayindex   - attribute selector passed to DPSOFTRAST_CALCATTRIB4F
//  zf           - per-pixel factor multiplied into the interpolated texcoord
//                 (presumably 1/w for perspective correction - confirm)
// Texel bytes are emitted in the order [2], [1], [0], [3], i.e. the first and
// third stored bytes are swapped on output, and scaled to the 0..1 range.
2419 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2422 int startx = span->startx;
2423 int endx = span->endx;
2428 float tc[2], endtc[2];
2430 unsigned int tci[2];
2431 unsigned int tci1[2];
2432 unsigned int tcimin[2];
2433 unsigned int tcimax[2];
2438 const unsigned char * RESTRICT pixelbase;
2439 const unsigned char * RESTRICT pixel[4];
2440 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2441 // if no texture is bound, just fill it with white
2444 for (x = startx;x < endx;x++)
2446 out4f[x*4+0] = 1.0f;
2447 out4f[x*4+1] = 1.0f;
2448 out4f[x*4+2] = 1.0f;
2449 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of this mip level inside
// texture->bytes
2453 mip = triangle->mip[texunitindex];
2454 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2455 // if this mipmap of the texture is 1 pixel, just fill it with that color
// mipmap[mip][1] == 4 is the one-texel test (4 bytes per texel)
2456 if (texture->mipmap[mip][1] == 4)
// NOTE(review): these reads go through texture->bytes, not through pixelbase
// computed above for this mip level - confirm that the start of the texture
// data is the intended source for a one-texel mip
2458 c[0] = texture->bytes[2] * (1.0f/255.0f);
2459 c[1] = texture->bytes[1] * (1.0f/255.0f);
2460 c[2] = texture->bytes[0] * (1.0f/255.0f);
2461 c[3] = texture->bytes[3] * (1.0f/255.0f);
2462 for (x = startx;x < endx;x++)
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
2471 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2473 flags = texture->flags;
// mipmap[mip][2] and [3] give the two texture dimensions; [2] is also the
// row length in texels used for addressing (tciwidth)
2474 tcscale[0] = texture->mipmap[mip][2];
2475 tcscale[1] = texture->mipmap[mip][3];
2476 tciwidth = texture->mipmap[mip][2];
2479 tcimax[0] = texture->mipmap[mip][2]-1;
2480 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as a bit mask wraps correctly only for power-of-two sizes
2481 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2482 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texcoord at the first pixel of the span, in texel units
2483 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2484 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// the span is processed in subspans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels:
// the texcoord is evaluated at the subspan end and stepped linearly in
// between using 12 fractional bits (subscale = 4096 / pixel count)
2490 for (x = startx;x < endx;)
2492 unsigned int subtc[2];
2493 unsigned int substep[2];
2494 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2495 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last subspan: end exactly on the final pixel of the span
2496 if (nextsub >= endx)
2498 nextsub = endsub = endx-1;
2499 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2503 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2504 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): substep and subtc are unsigned; converting a negative float
// to an unsigned integer is undefined behavior in C - confirm the values
// cannot be negative here or that the targets behave as intended
2510 substep[0] = (endtc[0] - tc[0]) * subscale;
2511 substep[1] = (endtc[1] - tc[1]) * subscale;
2512 subtc[0] = tc[0] * (1<<12);
2513 subtc[1] = tc[1] * (1<<12);
// bilinear, clamp-to-edge (presumably the filter != 0 branch - confirm):
// frac is the 12-bit fraction, lerp[] holds the four 2x2 weights, which sum
// to 0x1000000; 255 * 0x1000000 == 0xFF000000, hence the final scale factor
2516 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2521 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2522 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2523 tci[0] = subtc[0]>>12;
2524 tci[1] = subtc[1]>>12;
2525 tci1[0] = tci[0] + 1;
2526 tci1[1] = tci[1] + 1;
// clamp both the texel and its +1 neighbor into [tcimin, tcimax]
2527 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2528 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2529 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2530 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// pixel[0..3] = the 2x2 neighborhood, in the same order as lerp[0..3]
2531 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2533 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2534 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2535 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2536 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2537 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2538 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2539 out4f[x*4+0] = c[0];
2540 out4f[x*4+1] = c[1];
2541 out4f[x*4+2] = c[2];
2542 out4f[x*4+3] = c[3];
// bilinear, wrapping: identical to the loop above except that the texel
// indices are masked with tciwrapmask instead of clamped
2547 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2549 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2550 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2551 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2552 tci[0] = subtc[0]>>12;
2553 tci[1] = subtc[1]>>12;
2554 tci1[0] = tci[0] + 1;
2555 tci1[1] = tci[1] + 1;
2556 tci[0] &= tciwrapmask[0];
2557 tci[1] &= tciwrapmask[1];
2558 tci1[0] &= tciwrapmask[0];
2559 tci1[1] &= tciwrapmask[1];
2560 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2561 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2562 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2563 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2564 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2565 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2566 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2567 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2568 out4f[x*4+0] = c[0];
2569 out4f[x*4+1] = c[1];
2570 out4f[x*4+2] = c[2];
2571 out4f[x*4+3] = c[3];
// single-texel lookup, clamp-to-edge: the integer part of the coordinate is
// clamped and one texel is read
2575 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2577 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2579 tci[0] = subtc[0]>>12;
2580 tci[1] = subtc[1]>>12;
2581 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2582 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2583 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2584 c[0] = pixel[0][2] * (1.0f / 255.0f);
2585 c[1] = pixel[0][1] * (1.0f / 255.0f);
2586 c[2] = pixel[0][0] * (1.0f / 255.0f);
2587 c[3] = pixel[0][3] * (1.0f / 255.0f);
2588 out4f[x*4+0] = c[0];
2589 out4f[x*4+1] = c[1];
2590 out4f[x*4+2] = c[2];
2591 out4f[x*4+3] = c[3];
// single-texel lookup, wrapping: the integer part is masked with
// tciwrapmask
2596 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2598 tci[0] = subtc[0]>>12;
2599 tci[1] = subtc[1]>>12;
2600 tci[0] &= tciwrapmask[0];
2601 tci[1] &= tciwrapmask[1];
2602 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2603 c[0] = pixel[0][2] * (1.0f / 255.0f);
2604 c[1] = pixel[0][1] * (1.0f / 255.0f);
2605 c[2] = pixel[0][0] * (1.0f / 255.0f);
2606 c[3] = pixel[0][3] * (1.0f / 255.0f);
2607 out4f[x*4+0] = c[0];
2608 out4f[x*4+1] = c[1];
2609 out4f[x*4+2] = c[2];
2610 out4f[x*4+3] = c[3];
// DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8: SSE2 sampler that reads the
// texture bound to texunitindex across the span and writes one 4-byte color
// per pixel to out4ub (accessed as 32-bit words through outi[x]) for x in
// [startx, endx). Texel bytes are copied or blended in their stored order.
//  thread       - thread->texbound[texunitindex] is the texture to sample
//  triangle     - supplies the mip level and is passed to
//                 DPSOFTRAST_CALCATTRIB
//  span         - supplies startx and endx
//  out4ub       - destination, 4 bytes per pixel
//  texunitindex - texture unit to sample
//  arrayindex   - attribute selector passed to DPSOFTRAST_CALCATTRIB
//  zf           - per-pixel factor multiplied into the interpolated texcoord
//                 (presumably 1/w for perspective correction - confirm)
2616 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2620 int startx = span->startx;
2621 int endx = span->endx;
2623 __m128 data, slope, tcscale;
2624 __m128i tcsize, tcmask, tcoffset, tcmax;
2626 __m128i subtc, substep, endsubtc;
2629 int affine; // LordHavoc: optimized affine texturing case
2630 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2631 const unsigned char * RESTRICT pixelbase;
2632 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2633 // if no texture is bound, just fill it with white
// 255 in every byte of the span's pixels
2636 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2639 mip = triangle->mip[texunitindex];
2640 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2641 // if this mipmap of the texture is 1 pixel, just fill it with that color
2642 if (texture->mipmap[mip][1] == 4)
// k is the single texel of this mip level, read as one 32-bit word
2644 unsigned int k = *((const unsigned int *)pixelbase);
2645 for (x = startx;x < endx;x++)
// affine: zf is identical at both ends of the span, so the whole span is
// handled as one linearly stepped subspan (see the "|| affine" test below)
2649 affine = zf[startx] == zf[endx-1];
2650 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2651 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2652 flags = texture->flags;
// tcsize lanes = (mipmap[mip][2], mipmap[mip][3]) repeated twice, i.e. the
// two texture dimensions; tcmask = size - 1
2653 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2654 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2655 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two texcoord components into both halves and pre-scale them
// into texel units
2656 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2657 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2658 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift (NOTE(review): presumably applied only when filtering -
// confirm)
2660 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// texcoords are carried as 16.16 fixed point from here on
2661 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset is (width << 18) + 4, i.e. the 16-bit pair
// (4, width*4); _mm_madd_epi16 of an integer (x, y) pair with it yields the
// byte offset x*4 + y*width*4 of that texel
2662 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// tcmax: size - 1 narrowed to 16-bit lanes for the 16-bit min/max/and
// operations below
2663 tcmax = _mm_packs_epi32(tcmask, tcmask);
2664 for (x = startx;x < endx;)
2666 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2667 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final (or affine) subspan: end exactly on the last pixel of the span
2668 if (nextsub >= endx || affine)
2670 nextsub = endsub = endx-1;
2671 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2675 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2677 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2678 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2679 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half of subtc = texcoord of pixel x, high half = texcoord of pixel
// x+1; the step is doubled because the loops advance two pixels at a time
2680 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2681 substep = _mm_slli_epi32(substep, 1);
// fast filtered path: taken when the integer texel coordinates at the start
// and at the (stepped) end of the subspan all satisfy 0 <= t < size-1, so
// the 2x2 footprint never needs clamping or wrapping
2684 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2685 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// stride = bytes per texel row (the upper 16 bits of a tcoffset lane)
2687 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2688 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2690 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// the (3, 1, 3, 1) shuffles pick the integer (upper 16-bit) halves of the
// 16.16 coordinates
2691 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2692 tci = _mm_madd_epi16(tci, tcoffset);
2693 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2694 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 8-byte load fetches two horizontally adjacent texels; + stride
// fetches the pair one row further on
2695 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2696 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2697 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2698 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// 15-bit fractions for _mm_mulhi_epi16; the differences are shifted left by
// one so that the product's high half is difference * fraction
2699 fracm = _mm_srli_epi16(subtc, 1);
2700 pix1 = _mm_add_epi16(pix1,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703 pix3 = _mm_add_epi16(pix3,
2704 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2705 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2706 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2707 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2708 pix2 = _mm_add_epi16(pix2,
2709 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2710 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2711 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the fast filtered path
2715 const unsigned char * RESTRICT ptr1;
2716 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2717 tci = _mm_madd_epi16(tci, tcoffset);
2718 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2719 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2720 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2721 fracm = _mm_srli_epi16(subtc, 1);
2722 pix1 = _mm_add_epi16(pix1,
2723 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726 pix1 = _mm_add_epi16(pix1,
2727 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// general filtered path, clamp-to-edge: the integer (x, y) pair is
// replicated into four lanes, the offsets (0,0), (1,0), (0,1), (1,1) are
// added, and each coordinate is clamped to [0, tcmax] before addressing;
// the four texels are then fetched individually
2733 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2735 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2739 tci = _mm_madd_epi16(tci, tcoffset);
2740 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742 _mm_setzero_si128());
2743 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745 _mm_setzero_si128());
// second pixel of the pair (upper half of subtc)
// NOTE(review): this pixel's coordinates are masked with tcmax (wrapping)
// whereas the first pixel above is clamped with min/max - confirm whether
// the clamp was intended here too
2746 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748 tci = _mm_madd_epi16(tci, tcoffset);
2749 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751 _mm_setzero_si128());
2752 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754 _mm_setzero_si128());
2755 fracm = _mm_srli_epi16(subtc, 1);
2756 pix1 = _mm_add_epi16(pix1,
2757 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759 pix3 = _mm_add_epi16(pix3,
2760 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764 pix2 = _mm_add_epi16(pix2,
2765 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the clamped filtered path
2771 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2773 tci = _mm_madd_epi16(tci, tcoffset);
2774 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2775 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2776 _mm_setzero_si128());
2777 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2778 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2779 _mm_setzero_si128());
2780 fracm = _mm_srli_epi16(subtc, 1);
2781 pix1 = _mm_add_epi16(pix1,
2782 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785 pix1 = _mm_add_epi16(pix1,
2786 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// general filtered path, wrapping: same structure as the clamped path, with
// every coordinate masked by tcmax (size - 1) instead of clamped
2794 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2796 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2797 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2798 tci = _mm_madd_epi16(tci, tcoffset);
2799 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2800 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2801 _mm_setzero_si128());
2802 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2803 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2804 _mm_setzero_si128());
2805 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2806 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2807 tci = _mm_madd_epi16(tci, tcoffset);
2808 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2809 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2810 _mm_setzero_si128());
2811 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2812 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2813 _mm_setzero_si128());
2814 fracm = _mm_srli_epi16(subtc, 1);
2815 pix1 = _mm_add_epi16(pix1,
2816 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2817 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2818 pix3 = _mm_add_epi16(pix3,
2819 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2820 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2821 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2822 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2823 pix2 = _mm_add_epi16(pix2,
2824 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2825 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2826 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// one-pixel form of the wrapping filtered path
2830 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2831 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2832 tci = _mm_madd_epi16(tci, tcoffset);
2833 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2834 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2835 _mm_setzero_si128());
2836 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2837 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2838 _mm_setzero_si128());
2839 fracm = _mm_srli_epi16(subtc, 1);
2840 pix1 = _mm_add_epi16(pix1,
2841 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2842 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2843 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2844 pix1 = _mm_add_epi16(pix1,
2845 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2846 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2847 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel (no blending) paths: the integer coordinate is clamped
// (clamp-to-edge) or masked (wrapping) and one 32-bit texel is copied per
// pixel
2854 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2856 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2858 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2859 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2860 tci = _mm_madd_epi16(tci, tcoffset);
2861 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2862 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2866 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2867 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2868 tci = _mm_madd_epi16(tci, tcoffset);
2869 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2875 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2877 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2878 tci = _mm_and_si128(tci, tcmax);
2879 tci = _mm_madd_epi16(tci, tcoffset);
2880 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2881 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2885 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2886 tci = _mm_and_si128(tci, tcmax);
2887 tci = _mm_madd_epi16(tci, tcoffset);
2888 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Placeholder for cube-map sampling: no texture lookup is performed by the
// visible code. Every byte of the span's pixels [span->startx, span->endx) in
// out4ub (4 bytes per pixel) is set to 255.
// triangle, texunitindex, arrayindex and zf are not used by the visible code.
2897 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// length is (endx - startx) pixels * 4 bytes; the reversed subtraction gave a
// negative int that memset would receive as an enormous size_t
2900 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// NOTE(review): only the signature is visible here. Presumably returns a
// shadow-map sample for the given vector - confirm against the function body.
2903 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each pixel of in4f by an interpolated vertex attribute and stores
// the product in out4f. Both buffers hold 4 floats per pixel, indexed by x.
//   triangle, span - handed to DPSOFTRAST_CALCATTRIB4F to obtain data/slope;
//                    span also supplies the pixel range [startx, endx)
//   arrayindex     - selects the attribute array passed to the macro
//   zf             - NOTE(review): z is presumably loaded from zf[x] each
//                    iteration; that assignment is not visible here - confirm
2909 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2912 int startx = span->startx;
2913 int endx = span->endx;
// data/slope describe the attribute as a linear function of x: data + slope*x
2918 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2919 for (x = startx;x < endx;x++)
// evaluate the attribute at this x and scale it by z
2922 c[0] = (data[0] + slope[0]*x) * z;
2923 c[1] = (data[1] + slope[1]*x) * z;
2924 c[2] = (data[2] + slope[2]*x) * z;
2925 c[3] = (data[3] + slope[3]*x) * z;
// component-wise modulation of the input pixel
2926 out4f[x*4+0] = in4f[x*4+0] * c[0];
2927 out4f[x*4+1] = in4f[x*4+1] * c[1];
2928 out4f[x*4+2] = in4f[x*4+2] * c[2];
2929 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into out4f (4 floats per pixel) for
// every x in [span->startx, span->endx).
//   triangle, span - handed to DPSOFTRAST_CALCATTRIB4F to obtain data/slope
//   arrayindex     - selects the attribute array passed to the macro
//   zf             - NOTE(review): z is presumably loaded from zf[x] each
//                    iteration; that assignment is not visible here - confirm
2933 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2936 int startx = span->startx;
2937 int endx = span->endx;
// data/slope describe the attribute as a linear function of x: data + slope*x
2942 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2943 for (x = startx;x < endx;x++)
// evaluate the attribute at this x and scale it by z
2946 c[0] = (data[0] + slope[0]*x) * z;
2947 c[1] = (data[1] + slope[1]*x) * z;
2948 c[2] = (data[2] + slope[2]*x) * z;
2949 c[3] = (data[3] + slope[3]*x) * z;
2950 out4f[x*4+0] = c[0];
2951 out4f[x*4+1] = c[1];
2952 out4f[x*4+2] = c[2];
2953 out4f[x*4+3] = c[3];
// For every pixel x in [span->startx, span->endx) computes, per component,
//   out4f = ina4f + max(inb4f - subcolor, 0)
// All buffers hold 4 floats per pixel; subcolor points to 4 floats.
// triangle is not used by the visible code.
2957 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2959 int x, startx = span->startx, endx = span->endx;
2960 float c[4], localcolor[4];
// local copy of the subtract color
2961 localcolor[0] = subcolor[0];
2962 localcolor[1] = subcolor[1];
2963 localcolor[2] = subcolor[2];
2964 localcolor[3] = subcolor[3];
2965 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2967 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2968 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2969 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2970 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2971 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2972 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2973 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2974 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float buffers over the span:
//   out4f = ina4f * inb4f for every pixel x in [span->startx, span->endx),
// 4 floats per pixel. triangle is not used by the visible code.
2978 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2980 int x, startx = span->startx, endx = span->endx;
2981 for (x = startx;x < endx;x++)
2983 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2984 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2985 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2986 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float buffers over the span:
//   out4f = ina4f + inb4f for every pixel x in [span->startx, span->endx),
// 4 floats per pixel. triangle is not used by the visible code.
2990 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2992 int x, startx = span->startx, endx = span->endx;
2993 for (x = startx;x < endx;x++)
2995 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2996 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2997 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2998 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float buffers over the span, weighting by component 3 of inb4f:
//   out4f = ina4f * a + inb4f * b, with a = 1 - inb4f[x*4+3]
// for every pixel x in [span->startx, span->endx), 4 floats per pixel.
// NOTE(review): the assignment of b is not visible here; presumably
// b = inb4f[x*4+3] so that a + b == 1 - confirm.
// triangle is not used by the visible code.
3002 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3004 int x, startx = span->startx, endx = span->endx;
3006 for (x = startx;x < endx;x++)
// weight applied to the first buffer
3008 a = 1.0f - inb4f[x*4+3];
3010 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3011 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3012 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3013 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends every pixel of in4f toward a constant color, using the color's own
// component 3 as the blend factor:
//   out4f = in4f * (1 - color[3]) + color * color[3]
// for every pixel x in [span->startx, span->endx), 4 floats per pixel.
// triangle is not used by the visible code.
3017 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3019 int x, startx = span->startx, endx = span->endx;
3020 float localcolor[4], ilerp, lerp;
// local copy of the uniform color
3021 localcolor[0] = color[0];
3022 localcolor[1] = color[1];
3023 localcolor[2] = color[2];
3024 localcolor[3] = color[3];
// lerp weights the uniform color, ilerp weights the input pixel
3025 ilerp = 1.0f - localcolor[3];
3026 lerp = localcolor[3];
3027 for (x = startx;x < endx;x++)
3029 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3030 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3031 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3032 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies each 4-byte pixel of in4ub by an interpolated vertex attribute and
// writes the result to out4ub, for pixels in [span->startx, span->endx).
// The attribute is evaluated exactly (data + slope*x, times zf[x]) only at
// sub-span boundaries, converted to fixed point scaled by 256, and stepped
// linearly in between. Sub-spans are at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
//   triangle, span - handed to DPSOFTRAST_CALCATTRIB to obtain data/slope
//   arrayindex     - selects the attribute array passed to the macro
//   zf             - per-pixel factors, read at sub-span boundaries
// NOTE(review): mod and submod are presumably seeded from endmod/endsubmod at
// the top of each sub-span; those assignments are not visible here - confirm.
3038 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3042 int startx = span->startx;
3043 int endx = span->endx;
3046 __m128i submod, substep, endsubmod;
3047 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the pixel byte order
3048 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3049 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point (x256)
3050 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3051 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3052 for (x = startx; x < endx;)
3054 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// subscale converts a whole-sub-span delta into a per-pixel fixed-point step
3055 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3056 if (nextsub >= endx)
// last, possibly shorter, sub-span: end on the final pixel of the span
3058 nextsub = endsub = endx-1;
3059 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
3063 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3064 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3065 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low 4 words: modulator for pixel x; high 4 words: modulator for pixel x+1
3066 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3067 substep = _mm_packs_epi32(substep, substep);
// two pixels are consumed per iteration, so both halves must advance by two
// per-pixel steps; adding substep only once interpolated at half rate
3068 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, _mm_slli_epi16(substep, 1)))
// bytes are placed in the high half of each word (byte<<8), so the unsigned
// high multiply yields byte * modulator / 256
3070 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3071 pix = _mm_mulhi_epu16(pix, submod);
3072 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
// NOTE(review): the condition guarding this tail is not visible here
3076 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3077 pix = _mm_mulhi_epu16(pix, submod);
3078 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute as 4-byte pixels into out4ub for
// pixels in [span->startx, span->endx).
// The attribute is evaluated exactly (data + slope*x, times zf[x]) only at
// sub-span boundaries, converted to fixed point scaled by 4095, and stepped
// linearly in between; each value is shifted right by 4 and saturated to a byte.
//   triangle, span - handed to DPSOFTRAST_CALCATTRIB to obtain data/slope
//   arrayindex     - selects the attribute array passed to the macro
//   zf             - per-pixel factors, read at sub-span boundaries
// NOTE(review): mod and submod are presumably seeded from endmod/endsubmod at
// the top of each sub-span; those assignments are not visible here - confirm.
3085 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3089 int startx = span->startx;
3090 int endx = span->endx;
3093 __m128i submod, substep, endsubmod;
3094 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the attribute order matches the pixel byte order
3095 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3096 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point (x4095)
3097 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3098 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3099 for (x = startx; x < endx;)
3101 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// subscale converts a whole-sub-span delta into a per-pixel fixed-point step
3102 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3103 if (nextsub >= endx)
// last, possibly shorter, sub-span: end on the final pixel of the span
3105 nextsub = endsub = endx-1;
3106 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at the end of this sub-span
3110 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3111 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3112 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low 4 words: value for pixel x; high 4 words: value for pixel x+1
3113 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3114 substep = _mm_packs_epi32(substep, substep);
// two pixels are consumed per iteration, so both halves must advance by two
// per-pixel steps; adding substep only once interpolated at half rate
3115 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, _mm_slli_epi16(substep, 1)))
// drop the 4 fractional bits, then saturate each word to an unsigned byte
3117 __m128i pix = _mm_srai_epi16(submod, 4);
3118 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
// NOTE(review): the condition guarding this tail is not visible here
3122 __m128i pix = _mm_srai_epi16(submod, 4);
3123 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Byte-pixel version of the bloom add: per component,
//   out4ub = saturate(ina4ub + max(inb4ub - subcolor*255, 0))
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// subcolor points to 4 floats; components 0 and 2 are swapped after scaling.
// triangle is not used by the visible code.
3130 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3133 int x, startx = span->startx, endx = span->endx;
// scale the subtract color by 255, swap components 0 and 2, then pack to
// 16-bit words duplicated in both halves (one copy per pixel of a pair)
3134 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3135 localcolor = _mm_packs_epi32(localcolor, localcolor);
// main loop: two pixels per iteration
3136 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit words
3138 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3139 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps at zero; the pack saturates at 255
3140 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3141 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3145 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3146 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3147 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3148 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Component-wise product of two byte-pixel buffers over the span:
//   out4ub = (ina4ub * inb4ub) / 256 per byte, saturated to a byte,
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3153 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3156 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3157 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes zero-extended; pix2 holds the bytes in the high half of
// each word (byte<<8), so the high multiply below gives a*b/256
3159 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3160 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3161 pix1 = _mm_mulhi_epu16(pix1, pix2);
3162 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3166 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3167 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3168 pix1 = _mm_mulhi_epu16(pix1, pix2);
3169 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Component-wise saturating sum of two byte-pixel buffers over the span:
//   out4ub = min(ina4ub + inb4ub, 255) per byte,
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3174 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3177 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3178 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit words, add, then pack back with unsigned saturation
3180 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3181 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3182 pix1 = _mm_add_epi16(pix1, pix2);
3183 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3187 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3188 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3189 pix1 = _mm_add_epi16(pix1, pix2);
3190 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted copy of the second byte-pixel buffer to the first:
//   out4ub = saturate(ina4ub + inb4ub * tint), per byte,
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// inbtintbgra points to 4 floats; they are used in the order given (no
// component swap is applied here). triangle is not used by the visible code.
3195 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3198 int x, startx = span->startx, endx = span->endx;
// tint as fixed point scaled by 256, packed to words and duplicated per pixel
3199 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3200 tint = _mm_packs_epi32(tint, tint);
// main loop: two pixels per iteration
3201 for (x = startx;x+2 <= endx;x+=2)
// pix2 holds byte<<8, so mulhi(tint, pix2) == (tint*256 * byte*256) >> 16
3203 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3204 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3205 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3206 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3210 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3212 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3213 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends the second byte-pixel buffer over the first, weighted by byte 3 of
// each inb4ub pixel:
//   out4ub = ina4ub + (inb4ub - ina4ub) * blend / 256, per byte,
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// triangle is not used by the visible code.
3218 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3221 int x, startx = span->startx, endx = span->endx;
// main loop: two pixels per iteration
3222 for (x = startx;x+2 <= endx;x+=2)
3224 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3225 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast word 3 of each pixel of pix2 across that pixel's four words
3226 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high multiply
// ((d*16) * (blend*16)) >> 16 equals d * blend / 256
3227 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3228 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3232 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3233 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3234 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3235 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3236 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends every byte-pixel of in4ub toward a constant color, weighted by the
// color's component 3:
//   out4ub = in4ub + (color*255 - in4ub) * (color[3]*255) / 256, per byte,
// for pixels in [span->startx, span->endx), 4 bytes per pixel.
// color points to 4 floats; components 0 and 2 are swapped after scaling.
// triangle is not used by the visible code.
3241 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3244 int x, startx = span->startx, endx = span->endx;
// scale by 255, swap components 0 and 2, pack to words duplicated per pixel
3245 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3246 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast word 3 of the color to all words and pre-shift it by 4 for the
// fixed-point multiply in the loop
3247 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// main loop: two pixels per iteration
3248 for (x = startx;x+2 <= endx;x+=2)
3250 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// ((color - pix)*16 * blend*16) >> 16 == (color - pix) * blend / 256
3251 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3252 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant for a leftover pixel
// NOTE(review): the condition guarding this tail is not visible here
3256 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3257 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3258 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms/projects the position array
// with the matrix stored at the ModelViewProjectionMatrixM1 uniform slot, then
// loads the color and texcoord0 arrays. The texcoord1 array is loaded only when
// the SPECULAR permutation bit is set.
3265 void DPSOFTRAST_VertexShader_Generic(void)
3267 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3268 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3269 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3270 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3271 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span. Builds the fragment color in
// buffer_FragColorbgra8 (4 bytes per pixel) and hands it to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies shader_permutation and is forwarded to the span helpers
//   triangle - forwarded to the span helpers
//   span     - the span being shaded
3274 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3277 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3278 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3279 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3280 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3281 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture sampled with attribute array 2, modulated by attribute array 1
3283 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3284 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3285 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// second texture is combined with the fragment color by multiply, add or mix
3287 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3288 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3291 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this repeats the SHADERPERMUTATION_COLORMAPPING test used just
// above, so the add path below can never be taken once the first test passes.
// A different permutation flag (possibly SHADERPERMUTATION_GLOW) was probably
// intended - confirm against the reference shader before changing.
3293 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3296 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3298 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3301 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): presumably the no-DIFFUSE fallback (the else line is not
// visible here): the fragment color is attribute array 1 alone - confirm
3306 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3307 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms/projects the position
// array with the matrix stored at the ModelViewProjectionMatrixM1 uniform slot,
// then loads two texcoord arrays. The second DPSOFTRAST_Array_Load is called
// with (TEXCOORD1, TEXCOORD4), i.e. two different array ids.
3312 void DPSOFTRAST_VertexShader_PostProcess(void)
3314 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3315 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3316 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first
// texture into the fragment buffer, optionally adds bloom from the second
// texture, blends toward the view tint color, then hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8. Saturation and gamma ramps are recognised
// as permutation bits but not implemented.
//   thread   - supplies shader_permutation and uniform4f
//   triangle - forwarded to the span helpers
//   span     - the span being shaded
3319 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3321 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3322 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3323 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3324 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3325 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3326 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3327 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// bloom texture uses attribute array 3; the BloomColorSubtract uniform is the
// threshold subtracted before adding
3329 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3330 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3332 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3333 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3335 // TODO: implement saturation
3337 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3339 // TODO: implement gammaramps
3341 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms/projects the
// position array with the matrix stored at the ModelViewProjectionMatrixM1
// uniform slot; no other arrays are loaded.
3346 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3348 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: zero-fills the span's pixels
// [span->startx, span->endx) in a local fragment buffer (4 bytes per pixel) and
// hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3351 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3353 // this is never called (because colormask is off when this shader is used)
3354 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3355 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3356 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3357 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3358 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms/projects the position
// array with the ModelViewProjectionMatrixM1 uniform and transforms the
// texcoord0 array with the matrix stored at the TexMatrixM1 uniform slot.
3363 void DPSOFTRAST_VertexShader_FlatColor(void)
3365 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3366 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader for one span: samples the color texture
// and multiplies it by a constant built from the Color_Ambient uniform, with
// the Alpha uniform in the fourth lane.
// The result is written straight into the framebuffer row unless alpha test is
// on or the blend mode is not opaque; in that case it is written to a local
// buffer and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies alphatest, fb_blendmode and uniform4f
//   triangle - forwarded to the span helpers
//   span     - supplies x, y, startx, endx and the per-pixel mask
3369 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3372 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's position in color buffer 0 (4 bytes/pixel)
3373 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3374 int x, startx = span->startx, endx = span->endx;
3375 __m128i Color_Ambientm;
3376 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3377 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3380 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render to the local buffer
3381 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3382 pixel = buffer_FragColorbgra8;
// ambient color as fixed point (x256) with components 0 and 2 swapped; lane 3
// is then cleared and replaced by the Alpha uniform scaled by 255
3383 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3384 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3385 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3386 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3387 for (x = startx;x < endx;x++)
// fast path: four pixels at once when all four mask bytes equal 1
// NOTE(review): the matching advance of x for this path is not visible here
3390 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3393 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texel bytes sit in the high half of each word, so mulhi gives ambient*texel
3394 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3395 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3396 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3402 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3403 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3404 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still needs the finish pass
3406 if (pixel == buffer_FragColorbgra8)
3407 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms/projects the position
// array with the ModelViewProjectionMatrixM1 uniform, loads the color array,
// and transforms the texcoord0 array with the TexMatrixM1 uniform.
3413 void DPSOFTRAST_VertexShader_VertexColor(void)
3415 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3416 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3417 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader for one span. Per pixel it computes
//   (Color_Diffuse * interpolated vertex color + Color_Ambient) * texel
// in 16-bit fixed point, with the Alpha uniform in the ambient term's lane 3.
// The result is written straight into the framebuffer row unless alpha test is
// on or the blend mode is not opaque; in that case it is written to a local
// buffer and passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - supplies alphatest, fb_blendmode and uniform4f
//   triangle - forwarded to the span helpers and DPSOFTRAST_CALCATTRIB
//   span     - supplies x, y, startx, endx and the per-pixel mask
3420 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3423 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: this span's position in color buffer 0 (4 bytes/pixel)
3424 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3425 int x, startx = span->startx, endx = span->endx;
3426 __m128i Color_Ambientm, Color_Diffusem;
3428 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3429 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3432 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3433 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finish pass, so render to the local buffer
3434 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3435 pixel = buffer_FragColorbgra8;
// ambient color as fixed point (x256) with components 0 and 2 swapped; lane 3
// is then cleared and replaced by the Alpha uniform scaled by 255
3436 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3437 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3438 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3439 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with lane 3 cleared; together with the x4096
// vertex color below, the high multiply yields a x256 product
3440 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3441 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3442 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color as a linear function of x: swap components 0 and 2, move the
// origin to startx, and pre-scale value and slope by 4096
3443 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3444 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3445 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3446 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3447 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3448 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances by one slope per loop iteration
3449 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3451 __m128i color, mod, pix;
// fast path: four pixels at once when all four mask bytes equal 1
// NOTE(review): the matching advance of x for this path is not visible here
3452 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive z values; each is broadcast to scale one pixel's color
3455 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3456 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3457 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
// data is stepped three times here, once between each of the four pixels
3458 data = _mm_add_ps(data, slope);
3459 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3460 data = _mm_add_ps(data, slope);
3461 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3462 data = _mm_add_ps(data, slope);
3463 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, texel bytes in the high half
3464 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3465 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3466 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3467 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3468 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3474 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3475 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3476 mod = _mm_packs_epi32(mod, mod);
3477 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3478 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the local-buffer case still needs the finish pass
3480 if (pixel == buffer_FragColorbgra8)
3481 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms/projects the position array
// with the ModelViewProjectionMatrixM1 uniform, transforms the texcoord0 array
// with the TexMatrixM1 uniform, and loads the texcoord4 array unchanged.
3487 void DPSOFTRAST_VertexShader_Lightmap(void)
3489 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3490 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3491 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3494 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3497 unsigned char * RESTRICT pixelmask = span->pixelmask;
3498 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3499 int x, startx = span->startx, endx = span->endx;
3500 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3501 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3502 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3503 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3504 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3505 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3506 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3507 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3508 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3509 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3510 pixel = buffer_FragColorbgra8;
3511 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3512 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3513 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3514 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3515 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3516 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3517 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3518 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3520 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3521 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3522 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3523 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3524 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3525 for (x = startx;x < endx;x++)
3527 __m128i color, lightmap, glow, pix;
3528 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3531 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3532 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3533 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3534 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3535 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3536 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3537 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3538 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3539 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3540 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3546 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3547 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3548 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3549 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3550 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3551 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3556 for (x = startx;x < endx;x++)
3558 __m128i color, lightmap, pix;
3559 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3562 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3563 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3564 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3565 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3566 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3567 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3568 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3574 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3575 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3576 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3577 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3580 if (pixel == buffer_FragColorbgra8)
3581 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations of the LightDirection vertex and pixel shaders.
// The FakeLight and LightDirectionMap entry points that follow call these two
// functions before their definitions appear later in the file.
3586 void DPSOFTRAST_VertexShader_LightDirection(void);
3587 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader entry point.
// Does no work of its own: it only runs DPSOFTRAST_VertexShader_LightDirection.
3589 void DPSOFTRAST_VertexShader_FakeLight(void)
3591 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader entry point.
// Forwards thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which selects its FakeLight behaviour by testing thread->shader_mode against SHADERMODE_FAKELIGHT.
3594 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3596 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space LightDirectionMap mode.
// Runs the LightDirection vertex shader, then additionally calls DPSOFTRAST_Array_Load for
// TEXCOORD4; the LightDirection pixel shader samples the lightmap and deluxemap textures
// with TEXCOORD4 in this mode.
// NOTE(review): DPSOFTRAST_Array_Load presumably passes the array through unmodified - confirm.
3601 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3603 DPSOFTRAST_VertexShader_LightDirection();
3604 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space LightDirectionMap mode.
// Forwards its arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which handles
// this mode in its SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches.
3607 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3609 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space LightDirectionMap mode.
// Runs the LightDirection vertex shader, then additionally calls DPSOFTRAST_Array_Load for
// TEXCOORD4; the LightDirection pixel shader samples the lightmap and deluxemap textures
// with TEXCOORD4 in this mode.
// NOTE(review): DPSOFTRAST_Array_Load presumably passes the array through unmodified - confirm.
3614 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3616 DPSOFTRAST_VertexShader_LightDirection();
3617 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space LightDirectionMap mode.
// Forwards its arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which handles
// this mode in its SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches.
3620 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3622 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for directional lighting.
// Reads the LightDir and EyePosition uniforms, prepares the POSITION and TEXCOORD0-3 arrays,
// then for every vertex writes two 4-float outputs:
//   TEXCOORD5 = LightDir dotted with the vectors held in TEXCOORD1/2/3 (svector, tvector, normal), w = 0
//   TEXCOORD6 = (EyePosition - position) dotted with the same three vectors, w = 0
// Finally POSITION is handed to DPSOFTRAST_Array_TransformProject together with the
// ModelViewProjectionMatrixM1 uniform.
// NOTE(review): i, LightDir, EyeVector, position, svector, tvector and normal are used below,
// but their declarations are not visible in this listing.
3627 void DPSOFTRAST_VertexShader_LightDirection(void)
3630 int numvertices = dpsoftrast.numvertices;
3632 float LightVector[4];
3633 float EyePosition[4];
3634 float EyeVectorModelSpace[4];
// Copy the two uniforms into locals. Component 3 of each is loaded but not read by the
// code visible in this function.
3640 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3641 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3642 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3643 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3644 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3645 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3646 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3647 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read from post_array4f in the loop below; TEXCOORD0 additionally goes
// through DPSOFTRAST_Array_Transform with the TexMatrixM1 uniform.
// NOTE(review): presumably these calls fill dpsoftrast.post_array4f for the named arrays - confirm.
3648 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3649 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3650 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3651 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3652 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3653 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its three basis vectors (4 floats per vertex, xyz used).
3655 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3656 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3657 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3658 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3659 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3660 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3661 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3662 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3663 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3664 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3665 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3666 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// LightVector = LightDir expressed in the (svector, tvector, normal) basis via three dot products.
3667 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3668 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3669 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3670 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3671 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3672 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3673 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Same projection for the vector from the vertex position to the eye position.
3674 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3675 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3676 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3677 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3678 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3679 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3680 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3681 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3682 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3683 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Last step: transform/project POSITION with the model-view-projection uniform.
// NOTE(review): the meaning of the -1 second argument is defined by DPSOFTRAST_Array_TransformProject - confirm there.
3685 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Helper macros; DPSOFTRAST_Vector3Dot and DPSOFTRAST_Vector3Normalize are used by
// DPSOFTRAST_PixelShader_LightDirection.
// Min/Max are plain ternary macros: the selected argument is evaluated twice, so do not
// pass expressions with side effects.
3688 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3689 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable operands.
3690 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length (dot of v with itself) and length (sqrt of the squared length).
3691 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3692 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro operating on v; it starts by computing len = sqrt(dot(v, v)).
// NOTE(review): only this first body line is visible in this listing; the remaining
// continuation lines of the macro are not shown here.
3693 #define DPSOFTRAST_Vector3Normalize(v)\
3696 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, FakeLight and LightDirectionMap
// (model-space / tangent-space) entry points.
//
// For one span it samples the color texture (plus pants/shirt, glow, normal, gloss, lightmap
// and deluxemap textures, depending on thread->shader_permutation and thread->shader_mode),
// evaluates the lighting per pixel in floating point, writes 4 bytes per pixel into
// buffer_FragColorbgra8 and passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// Three shading paths are visible:
//   1. SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular (+ glow)
//   2. SHADERPERMUTATION_DIFFUSE:  ambient + diffuse (+ glow)
//   3. the final loop:             ambient only (+ glow)
//
// NOTE(review): this listing does not show every line of the function. Several locals used
// below (z, d, glosstex, eyenormal, diffuse, specular, and f in the exact-specular code) have
// no visible declaration or assignment here, and some branch keywords between the visible
// statements are not shown either.
3707 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Scratch buffers sized for the longest span: buffer_z plus one 4-bytes-per-pixel buffer
// for each sampled texture and one for the output color.
3709 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3710 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3718 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3719 int x, startx = span->startx, endx = span->endx;
3720 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolation parameters produced by DPSOFTRAST_CALCATTRIB4F; the loops below evaluate
// each attribute at pixel x as data + slope * x.
3721 float LightVectordata[4];
3722 float LightVectorslope[4];
3723 float EyeVectordata[4];
3724 float EyeVectorslope[4];
3725 float VectorSdata[4];
3726 float VectorSslope[4];
3727 float VectorTdata[4];
3728 float VectorTslope[4];
3729 float VectorRdata[4];
3730 float VectorRslope[4];
3732 float diffusetex[4];
3734 float surfacenormal[4];
3735 float lightnormal[4];
3736 float lightnormal_modelspace[4];
3738 float specularnormal[4];
3741 float SpecularPower;
// Color uniforms are loaded with components 0 and 2 swapped, so that local index 0..2 pairs
// with bytes 0..2 of the *bgra8 buffers in the per-pixel math below
// (presumably RGB uniform order -> BGR byte order; confirm against the uniform setup).
3743 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3744 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3745 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3746 Color_Glow[3] = 0.0f;
// Color_Ambient[3] carries component 0 of the Alpha uniform; it is the only factor applied
// to the texture alpha when the output alpha byte d[3] is computed in every path below.
3747 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3748 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3749 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3750 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3751 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3752 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3753 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3754 Color_Pants[3] = 0.0f;
3755 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3756 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3757 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3758 Color_Shirt[3] = 0.0f;
// Begin the span (fills buffer_z) and sample the color texture, which every path uses.
// Pants/shirt and glow textures are sampled only when their permutation bits are set.
3759 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3760 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3761 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3763 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3764 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3766 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3768 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: SHADERPERMUTATION_SPECULAR (ambient + diffuse + specular) ----
3770 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3772 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3773 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3774 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3775 Color_Diffuse[3] = 0.0f;
// Default light color from the uniform; the LightDirectionMap and FakeLight modes overwrite
// it per pixel inside the loop.
3776 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3777 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3778 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3779 LightColor[3] = 0.0f;
3780 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3781 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3782 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3783 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3784 Color_Specular[3] = 0.0f;
// The uniform is scaled by 1/255 here; in the pow() call below it is multiplied by
// glosstex[3], which is read from an unsigned char buffer (0..255).
3785 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3786 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3787 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup. Model-space deluxemaps need the interpolated TEXCOORD1/2/3 vectors
// plus lightmap and deluxemap samples (both fetched with TEXCOORD4); tangent-space needs
// only the two samples; FakeLight needs nothing extra; the last statement sets up the
// interpolated TEXCOORD5 light vector (presumably the final else case - its keyword is not
// visible in this listing).
3789 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3791 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3792 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3793 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3794 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3795 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3797 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3799 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3800 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3802 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3804 // nothing of this needed
3808 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the specular path.
3811 for (x = startx;x < endx;x++)
3814 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3815 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3816 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3817 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the pants and shirt texels, each tinted by its color, to the base texel.
// Component 3 of both tint colors is 0, so the alpha term adds nothing.
3818 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3820 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3821 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3822 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3823 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3825 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3826 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3827 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3828 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte * (1/128) - 1 per component, with buffer bytes 2,1,0
// going to components 0,1,2, then normalize.
3829 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3830 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3831 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3832 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal (and, for the map and FakeLight modes, LightColor) for this pixel.
3834 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3836 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3837 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3838 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3839 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3841 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3842 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3843 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3844 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3846 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3847 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3848 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3849 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3851 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3852 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3853 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3854 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3856 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3857 DPSOFTRAST_Vector3Normalize(lightnormal);
3859 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Light color = lightmap texel scaled by 1 / (256 * max(0.25, lightnormal[2])).
3861 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3862 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3863 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3864 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space deluxemap: the decoded texel is used directly as lightnormal (no
// normalization call is visible for this mode) and the lightmap texel is scaled by 1/256.
3867 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3869 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3870 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3871 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3873 float f = 1.0f / 256.0f;
3874 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3875 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3876 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// FakeLight: the normalized interpolated eye vector serves as the light direction, and the
// light color is set to 1.0 in all three channels.
3879 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3881 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3882 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3883 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3884 DPSOFTRAST_Vector3Normalize(lightnormal);
3886 LightColor[0] = 1.0;
3887 LightColor[1] = 1.0;
3888 LightColor[2] = 1.0;
// Remaining mode (presumably the final else case): use the interpolated TEXCOORD5 light
// vector; LightColor keeps the uniform value loaded before the loop.
3892 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3893 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3894 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3895 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(surface normal, light direction), clamped at zero.
3898 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3900 if(thread->shader_exactspecularmath)
3902 // reflect lightnormal at surfacenormal, take the negative of that
3903 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3905 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3906 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3907 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3908 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3910 // dot of this and normalize(EyeVectorFogDepth.xyz)
3911 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3912 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3913 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3914 DPSOFTRAST_Vector3Normalize(eyenormal);
3916 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Alternative specular computation (presumably the else case of shader_exactspecularmath;
// the keyword is not visible in this listing): normalize(lightnormal + eyenormal) dotted
// with the surface normal, clamped at zero.
3920 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3921 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3922 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3923 DPSOFTRAST_Vector3Normalize(eyenormal);
3925 specularnormal[0] = lightnormal[0] + eyenormal[0];
3926 specularnormal[1] = lightnormal[1] + eyenormal[1];
3927 specularnormal[2] = lightnormal[2] + eyenormal[2];
3928 DPSOFTRAST_Vector3Normalize(specularnormal);
3930 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Raise the specular term to the power SpecularPower * gloss alpha byte.
3933 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine (glow +) ambient + (diffuse + specular) * light color per channel; alpha is texture
// alpha times Color_Ambient[3]. Only the upper bound (255) is clamped here.
3934 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3936 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3937 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3938 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3939 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3943 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3944 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3945 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3946 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Store the four result bytes for this pixel.
3949 buffer_FragColorbgra8[x*4+0] = d[0];
3950 buffer_FragColorbgra8[x*4+1] = d[1];
3951 buffer_FragColorbgra8[x*4+2] = d[2];
3952 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: SHADERPERMUTATION_DIFFUSE (ambient + diffuse, no specular) ----
// Unlike the specular path, the loop of this path does not add pants/shirt contributions
// to diffusetex.
3955 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3957 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3958 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3959 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3960 Color_Diffuse[3] = 0.0f;
3961 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3962 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3963 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3964 LightColor[3] = 0.0f;
3965 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific setup, as in the specular path, except that the eye vector (TEXCOORD6) is
// interpolated only for FakeLight, the one mode of this path that reads it.
3967 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3969 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3970 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3971 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3972 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3973 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3975 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3977 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3978 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3980 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3982 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3986 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the diffuse path.
3989 for (x = startx;x < endx;x++)
3992 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3993 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3994 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3995 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal-map texel (byte * (1/128) - 1, bytes 2,1,0 -> components 0,1,2).
3996 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3997 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3998 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3999 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal / LightColor per mode; the visible statements match the specular path.
4001 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4003 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4004 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4005 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4006 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4008 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4009 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4010 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4011 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4013 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4014 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4015 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4016 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4018 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4019 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4020 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4021 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4023 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4024 DPSOFTRAST_Vector3Normalize(lightnormal);
4026 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4028 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4029 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4030 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4031 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4034 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4036 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4037 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4038 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4040 float f = 1.0f / 256.0f;
4041 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4042 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4043 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4046 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4048 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4049 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4050 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4051 DPSOFTRAST_Vector3Normalize(lightnormal);
4053 LightColor[0] = 1.0;
4054 LightColor[1] = 1.0;
4055 LightColor[2] = 1.0;
4059 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4060 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4061 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4062 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then (glow +) texel * (ambient + diffuse * light color);
// only the upper bound (255) is clamped.
4065 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4066 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4068 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4069 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4070 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4071 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4075 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4076 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4077 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4078 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4080 buffer_FragColorbgra8[x*4+0] = d[0];
4081 buffer_FragColorbgra8[x*4+1] = d[1];
4082 buffer_FragColorbgra8[x*4+2] = d[2];
4083 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (plus glow when enabled) ----
// Presumably the final else case of the permutation test; its keyword is not visible in
// this listing. No normal map or light vector is read here.
4088 for (x = startx;x < endx;x++)
4091 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4092 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4093 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4094 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4096 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4098 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4099 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4100 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4101 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4105 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4106 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4107 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4108 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4110 buffer_FragColorbgra8[x*4+0] = d[0];
4111 buffer_FragColorbgra8[x*4+1] = d[1];
4112 buffer_FragColorbgra8[x*4+2] = d[2];
4113 buffer_FragColorbgra8[x*4+3] = d[3];
// Pass the computed span colors to DPSOFTRAST_Draw_Span_FinishBGRA8.
4116 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode. For every vertex it replaces the
// TEXCOORD1 and TEXCOORD2 entries of dpsoftrast.post_array4f with the
// vertex-to-light and vertex-to-eye offsets, each expressed as dot products
// against the three vectors read from TEXCOORD1, TEXCOORD2 and TEXCOORD3.
// NOTE(review): this view of the file omits short lines, so the braces and the
// declarations of i, position, svector, tvector, normal and EyeVector are not
// shown here.
4121 void DPSOFTRAST_VertexShader_LightSource(void)
4124 int numvertices = dpsoftrast.numvertices;
4125 float LightPosition[4];
4126 float LightVector[4];
4127 float LightVectorModelSpace[4];
4128 float EyePosition[4];
4129 float EyeVectorModelSpace[4];
// copy the light and eye positions out of the uniform table (4 floats each)
4135 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4136 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4137 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4138 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4139 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4140 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4141 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4142 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// stage the vertex arrays; TEXCOORD0 additionally goes through the TexMatrix uniform
// (presumably the helpers take (destination array, source array[, matrix]) - confirm against their definitions)
4143 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4144 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4145 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4146 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4147 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4148 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4149 for (i = 0;i < numvertices;i++)
// read the vertex position and the three basis vectors before TEXCOORD1/TEXCOORD2 are overwritten below
4151 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4152 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4153 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4154 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4155 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4156 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4157 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4158 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4159 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4160 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4161 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4162 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// offset from the vertex to the light, then its dot product with svector, tvector and normal
4163 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4164 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4165 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4166 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4167 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4168 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// the light vector replaces the svector in TEXCOORD1 (w is set to 0)
4169 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4170 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4171 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4172 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// same computation for the offset from the vertex to the eye
4173 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4174 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4175 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4176 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4177 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4178 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// the eye vector replaces the tvector in TEXCOORD2 (w is set to 0)
4179 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4180 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4181 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4182 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// issued after the loop, which has already consumed the POSITION and TEXCOORD3 values it needed;
// presumably these calls overwrite POSITION (projected) and TEXCOORD3 (ModelToLight result) - confirm.
// NOTE(review): the meaning of the -1 argument is not visible from here.
4184 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4185 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span shader for the LightSource mode. It samples the color texture (plus,
// depending on thread->shader_permutation, the pants/shirt, cube, normal and
// gloss textures), then for each x in [span->startx, span->endx) computes a
// lit BGRA value into buffer_FragColorbgra8 and finally hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three per-pixel loops exist: one for SHADERPERMUTATION_SPECULAR, one for
// SHADERPERMUTATION_DIFFUSE, and a last one that only uses Color_Ambient.
// NOTE(review): this view of the file omits short lines, so braces, the
// `else` keywords, the statements guarded by the attenuation tests, and the
// declarations of z, attenuation, diffuse, specular, f, d, glosstex and
// eyenormal are not shown here.
4188 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4191 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4192 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4194 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4195 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4196 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4197 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4198 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4199 int x, startx = span->startx, endx = span->endx;
4200 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4201 float CubeVectordata[4];
4202 float CubeVectorslope[4];
4203 float LightVectordata[4];
4204 float LightVectorslope[4];
4205 float EyeVectordata[4];
4206 float EyeVectorslope[4];
4208 float diffusetex[4];
4210 float surfacenormal[4];
4211 float lightnormal[4];
4213 float specularnormal[4];
4216 float SpecularPower;
4217 float CubeVector[4];
// Color uniforms are copied with components 0 and 2 swapped so that index 0
// lines up with byte 0 of the *bgra8 buffers. Alpha components are zero
// except Color_Ambient[3], which comes from the Alpha uniform.
4220 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4221 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4222 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4223 Color_Glow[3] = 0.0f;
4224 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4225 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4226 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4227 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4228 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4229 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4230 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4231 Color_Diffuse[3] = 0.0f;
4232 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4233 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4234 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4235 Color_Specular[3] = 0.0f;
4236 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4237 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4238 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4239 Color_Pants[3] = 0.0f;
4240 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4241 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4242 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4243 Color_Shirt[3] = 0.0f;
4244 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4245 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4246 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4247 LightColor[3] = 0.0f;
// the specular exponent uniform is pre-scaled by 1/255 because it is later multiplied by a gloss alpha byte (0..255)
4248 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// per-span base value and per-x slope for the interpolated light, eye and cube vectors
4249 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4250 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4251 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4252 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4253 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4254 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// optional texture fetches: pants/shirt only with COLORMAPPING, cube only with CUBEFILTER
4255 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4257 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4258 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4260 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4261 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- variant 1: diffuse + specular lighting (needs the normal and gloss textures) ----
4262 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4264 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4265 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4266 for (x = startx;x < endx;x++)
// attenuation is 1 - |CubeVector|^2; values below 0.01 are tested separately
// (the guarded statement is not visible here; per the memset comment above, dark pixels are skipped)
4269 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4270 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4271 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4272 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4273 if (attenuation < 0.01f)
4275 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4277 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4278 if (attenuation < 0.01f)
// base color in 0..255 float range, with tinted pants/shirt layers added when colormapping is on
4282 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4283 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4284 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4285 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4286 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4288 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4289 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4290 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4291 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4293 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4294 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4295 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4296 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1, reading bytes 2,1,0 into components 0,1,2
4297 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4298 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4299 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4300 DPSOFTRAST_Vector3Normalize(surfacenormal);
4302 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4303 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4304 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4305 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at zero
4307 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, first form: reflected light vector dotted with the eye vector
4309 if(thread->shader_exactspecularmath)
4311 // reflect lightnormal at surfacenormal, take the negative of that
4312 // i.e. we want (2*dot(N, I) * N - I) for N=surfacenormal, I=lightnormal
4314 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4315 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4316 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4317 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4319 // dot of this and normalize(EyeVectorFogDepth.xyz)
4320 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4321 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4322 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4323 DPSOFTRAST_Vector3Normalize(eyenormal);
4325 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, second form: normalized (light + eye) vector dotted with the surface normal
// (presumably the else branch of the shader_exactspecularmath test; the else line is not visible here)
4329 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4330 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4331 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4332 DPSOFTRAST_Vector3Normalize(eyenormal);
4334 specularnormal[0] = lightnormal[0] + eyenormal[0];
4335 specularnormal[1] = lightnormal[1] + eyenormal[1];
4336 specularnormal[2] = lightnormal[2] + eyenormal[2];
4337 DPSOFTRAST_Vector3Normalize(specularnormal);
4339 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent is SpecularPower (already divided by 255) times the gloss alpha byte
4341 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: (diffusetex*(ambient + diffuse) + gloss*specular) * light color * attenuation, clamped to 255;
// alpha is the (possibly colormapped) diffuse alpha
4343 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4345 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4346 attenuation *= (1.0f / 255.0f);
4347 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4348 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4349 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4350 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4354 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4355 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4356 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4357 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4359 buffer_FragColorbgra8[x*4+0] = d[0];
4360 buffer_FragColorbgra8[x*4+1] = d[1];
4361 buffer_FragColorbgra8[x*4+2] = d[2];
4362 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 2: diffuse lighting only (normal texture, no gloss/specular) ----
4365 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4367 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4368 for (x = startx;x < endx;x++)
// same attenuation computation and tests as in the specular loop
4371 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4372 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4373 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4374 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4375 if (attenuation < 0.01f)
4377 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4379 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4380 if (attenuation < 0.01f)
4384 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4385 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4386 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4387 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4388 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4390 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4391 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4392 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4393 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4395 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4396 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4397 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4398 DPSOFTRAST_Vector3Normalize(surfacenormal);
4400 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4401 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4402 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4403 DPSOFTRAST_Vector3Normalize(lightnormal);
4405 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// combine: diffusetex*(ambient + diffuse) * light color * attenuation, clamped to 255
4406 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4408 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4409 attenuation *= (1.0f / 255.0f);
4410 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4411 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4412 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4413 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4417 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4418 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4419 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4420 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4422 buffer_FragColorbgra8[x*4+0] = d[0];
4423 buffer_FragColorbgra8[x*4+1] = d[1];
4424 buffer_FragColorbgra8[x*4+2] = d[2];
4425 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- variant 3: ambient term only (no normal texture is sampled) ----
// (presumably the final else branch; the else line is not visible here)
4430 for (x = startx;x < endx;x++)
4433 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4434 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4435 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4436 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4437 if (attenuation < 0.01f)
4439 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4441 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4442 if (attenuation < 0.01f)
4446 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4447 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4448 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4449 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4450 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4452 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4453 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4454 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4455 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// combine: diffusetex * ambient * light color * attenuation, clamped to 255
4457 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4459 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4460 attenuation *= (1.0f / 255.0f);
4461 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4462 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4463 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4464 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4468 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4469 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4470 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4471 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4473 buffer_FragColorbgra8[x*4+0] = d[0];
4474 buffer_FragColorbgra8[x*4+1] = d[1];
4475 buffer_FragColorbgra8[x*4+2] = d[2];
4476 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colors to the common span finisher
4479 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Refraction mode: three array transforms driven by the
// ModelViewProjection matrix (POSITION into TEXCOORD1, and the projecting
// variant for POSITION itself) and by the TexMatrix (TEXCOORD0).
// Presumably the helpers take (destination array, source array, matrix) - confirm against their definitions.
4485 void DPSOFTRAST_VertexShader_Refraction(void)
4487 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4488 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4489 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the Refraction mode. For each pixel it derives a screen
// texture coordinate from the interpolated TEXCOORD1 attribute, offsets it by
// the x/y of the normal-map texel scaled by DistortScaleRefractReflect,
// fetches the GL20TU_REFRACTION texture there (bilinear or nearest) and
// writes the result tinted by RefractColor. Returns without drawing when no
// refraction texture is bound.
// NOTE(review): this view of the file omits short lines, so braces, the
// `else` keyword, the declarations of c, v and iw, and the statements that
// fill c[] in the nearest-filter path are not shown here.
4492 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4494 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4496 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4498 int x, startx = span->startx, endx = span->endx;
4501 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4502 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4505 float ModelViewProjectionPositiondata[4];
4506 float ModelViewProjectionPositionslope[4];
4509 float ScreenScaleRefractReflect[2];
4510 float ScreenCenterRefractReflect[2];
4511 float DistortScaleRefractReflect[2];
4512 float RefractColor[4];
4514 const unsigned char * RESTRICT pixelbase;
4515 const unsigned char * RESTRICT pixel[4];
4516 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4517 if(!texture) return;
// start of mip level 0: mipmap[0][0] is used as a byte offset into texture->bytes
4518 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4521 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4522 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4525 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
// only the first two components of each screen-space uniform are used
4528 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4529 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4530 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4531 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4532 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4533 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is stored with components 0 and 2 swapped to match the BGRA byte order of the output buffer
4534 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4535 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4536 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4537 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4540 for (x = startx;x < endx;x++)
4542 float SafeScreenTexCoord[2];
4543 float ScreenTexCoord[2];
4550 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4551 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4553 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4554 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4555 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the GLSL quoted below uses DistortScaleRefractReflect.zw, while this code reads components 0 and 1 of the uniform - confirm which is intended.
4557 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4558 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4559 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4560 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4561 DPSOFTRAST_Vector3Normalize(v);
4562 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4563 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4565 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
// bilinear path: coordinates are fixed point with 12 fractional bits, shifted by half a texel (2048);
// mipmap[0][2] is used as the row width and mipmap[0][3] as the row count
4566 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// NOTE(review): the initialisers are float expressions stored into unsigned int. A negative
// value (coordinate left of / above the first texel centre) is undefined behaviour on
// conversion in C, and because tc is unsigned, tc>>12 is never negative, so the
// `>= 0` clamps below cannot catch such coordinates - confirm whether tc should be signed.
4568 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4569 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4570 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights are products of two 12-bit factors and sum to 1<<24, hence the >>24 below
4571 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4572 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4573 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel indices (and their +1 neighbours) to the texture edges
4574 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4575 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4576 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4577 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// addresses of the 2x2 neighbourhood, 4 bytes per texel
4578 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4579 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4580 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4581 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4582 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4583 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4584 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// nearest path (presumably the else branch; the else line is not visible here): one clamped texel
4588 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4589 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4590 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4591 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4597 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// tint the fetched color; alpha comes solely from RefractColor[3], scaled by 256 and capped at 255
4598 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4599 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4600 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4601 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4604 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the Water mode: only runs the projecting transform on the
// POSITION array with the ModelViewProjection matrix; no other arrays are touched.
4609 void DPSOFTRAST_VertexShader_Water(void)
4611 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the Water mode: no shading is computed here. The span's
// part of the color buffer is zero-filled (4 bytes per pixel from startx up
// to endx) and passed to the common span finisher.
4615 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4618 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4619 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4620 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4621 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4622 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the ShowDepth mode: only runs the projecting transform on
// the POSITION array with the ModelViewProjection matrix.
4627 void DPSOFTRAST_VertexShader_ShowDepth(void)
4629 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the ShowDepth mode: no shading is computed here. The span's
// part of the color buffer is zero-filled and passed to the common span
// finisher; buffer_z is filled by Draw_Span_Begin but not read afterwards.
4632 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4635 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4636 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4637 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4638 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4639 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the DeferredGeometry mode: only runs the projecting
// transform on the POSITION array with the ModelViewProjection matrix.
4644 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4646 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the DeferredGeometry mode: no shading is computed here. The
// span's part of the color buffer is zero-filled and passed to the common
// span finisher.
4649 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4652 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4653 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4654 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4655 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4656 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the DeferredLightSource mode: only runs the projecting
// transform on the POSITION array with the ModelViewProjection matrix.
4661 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4663 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the DeferredLightSource mode: no shading is computed here.
// The span's part of the color buffer is zero-filled and passed to the common
// span finisher.
4666 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4669 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4670 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4671 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4672 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4673 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode descriptor: the vertex and span entry points plus the lists
// of vertex arrays and texture units that mode uses.
// NOTE(review): the table built from this type starts every row with an
// integer, so there is presumably a leading member that this view of the file
// does not show (short lines are omitted).
4678 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point (takes no arguments; the visible shaders read the global dpsoftrast state)
4681 void (*Vertex)(void);
// per-span pixel stage entry point
4682 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices used by the mode; table rows end the list with ~0 (0xFF once stored in an unsigned char)
4683 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture units used by the mode; table rows end the list with ~0 where one is given
4684 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4686 DPSOFTRAST_ShaderModeInfo;
// One descriptor row per shader mode (SHADERMODE_COUNT rows). Each row is:
//   leading integer, vertex shader, span shader, {array list, ~0}, {texture unit list, ~0}
// Presumably the table is indexed by shader mode, so the row order has to
// match the SHADERMODE enumeration - confirm against the enum before reordering.
4688 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4690 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4691 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4692 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4693 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4694 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4695 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4696 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4697 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4698 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4699 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4700 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4701 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below give no texture unit initializer, so their
// texunits arrays are zero-filled rather than starting with ~0 as in the
// Depth_Or_Shadow row - confirm the code that walks texunits copes with that.
4702 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4703 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4704 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4705 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Walks the spans queued on this thread.  When depth testing is active and a
// depth buffer exists, each span is depth-tested into a per-pixel mask first;
// spans are then handed to the current shader mode's Span callback if a color
// buffer and color mask are present.  thread->numspans is reset to 0 at the end.
4708 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4715 // unsigned int *colorpixel;
4716 unsigned int *depthpixel;
4722 DPSOFTRAST_State_Triangle *triangle;
4723 DPSOFTRAST_State_Span *span;
4724 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4725 for (i = 0; i < thread->numspans; i++)
4727 span = &thread->spans[i];
4728 triangle = &thread->triangles[span->triangle];
// depth-tested path
4729 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// triangle->w holds the plane for w: [0] = x slope, [1] = y slope, [2] = origin
// (as stored by DPSOFTRAST_Interpret_Draw); evaluate it at the span position
4731 wslope = triangle->w[0];
4732 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to the integer depth scale; the start value is biased by the polygon offset terms
4733 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4734 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel is based at the span origin, so it is indexed by span-relative x below
4735 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4736 startx = span->startx;
// each case fills pixelmask[x] with the outcome of comparing the stored depth
// against the interpolated depth d, stepping d by depthslope per pixel
4738 switch(thread->fb_depthfunc)
4741 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4742 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4743 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4744 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4745 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4746 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4747 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4749 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4750 //for (x = startx;x < endx;x++)
4751 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4752 // if there is no color buffer, skip pixel shader
// scan inward from both ends of the span past pixels whose mask is clear
4753 while (startx < endx && !pixelmask[startx])
4755 while (endx > startx && !pixelmask[endx-1])
4758 continue; // no pixels to fill
// publish the mask and the tightened start to the span before shading
4759 span->pixelmask = pixelmask;
4760 span->startx = startx;
4762 // run pixel shader if appropriate
4763 // do this before running depthmask code, to allow the pixelshader
4764 // to clear pixelmask values for alpha testing
4765 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4766 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// with depth writes enabled the span is walked once more with the same depth interpolation
4767 if (thread->depthmask)
4768 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4774 // no depth testing means we're just dealing with color...
4775 // if there is no color buffer, skip pixel shader
4776 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span is enabled (mask bytes set to 1) before shading
4778 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4779 span->pixelmask = pixelmask;
4780 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// the span queue has been consumed
4784 thread->numspans = 0;
// Draw command record: data size, the y range the draw covers, a reference
// count (set to the thread count by DPSOFTRAST_DrawTriangles and decremented
// by each interpreting thread), the clipped flag, the vertex range, the
// triangle count, the packed vertex arrays and the 32-bit / 16-bit index arrays.
4787 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Triangle setup for one Draw command on one thread.  Clamps the thread's two
// y bands to the scissor rectangle, clips each triangle against the near plane,
// derives the interpolation planes (w, optional clip plane, every shader array)
// and per-texture-unit mip levels, then emits spans into thread->spans.  The
// queues are drained through DPSOFTRAST_Draw_ProcessSpans when they fill and
// once more at the end.  The command's refcount is decremented on exit.
4789 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4792 int cullface = thread->cullface;
4793 int minx, maxx, miny, maxy;
4794 int miny1, maxy1, miny2, maxy2;
4795 __m128i fbmin, fbmax;
4796 __m128 viewportcenter, viewportscale;
4797 int firstvertex = command->firstvertex;
4798 int numvertices = command->numvertices;
4799 int numtriangles = command->numtriangles;
4800 const int *element3i = command->element3i;
4801 const unsigned short *element3s = command->element3s;
4802 int clipped = command->clipped;
4809 int starty, endy, bandy;
4813 float clip0origin, clip0slope;
4815 __m128 triangleedge1, triangleedge2, trianglenormal;
4818 DPSOFTRAST_State_Triangle *triangle;
4819 DPSOFTRAST_Texture *texture;
4820 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp both of this thread's y bands to the scissor's vertical extent
4821 miny = thread->fb_scissor[1];
4822 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4823 miny1 = bound(miny, thread->miny1, maxy);
4824 maxy1 = bound(miny, thread->maxy1, maxy);
4825 miny2 = bound(miny, thread->miny2, maxy);
4826 maxy2 = bound(miny, thread->maxy2, maxy);
// early out: the draw's y range overlaps neither band of this thread
4827 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// a decrement that reaches zero means this was the last thread holding the
// command; arrays is freed only when the command has the base size, which is
// how DPSOFTRAST_Draw_AllocateDrawCommand marks separately allocated vertex data
4829 if (!ATOMIC_DECREMENT(command->refcount))
4831 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4832 MM_FREE(command->arrays);
4836 minx = thread->fb_scissor[0];
4837 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// packed 16-bit (x, y) bounds repeated four times; fbmax is made inclusive by subtracting 1
4838 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4839 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4840 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4841 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4842 screen[3] = _mm_setzero_ps();
4843 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4844 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates; the next numvertices*4
// floats are the first post-transform array (see DPSOFTRAST_Draw_AllocateDrawCommand)
4846 const float *screencoord4f = command->arrays;
4847 const float *arrays = screencoord4f + numvertices*4;
4849 // generate the 3 edges of this triangle
4850 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three vertex indices from the 16-bit index array, rebased to firstvertex
4853 e[0] = element3s[i*3+0] - firstvertex;
4854 e[1] = element3s[i*3+1] - firstvertex;
4855 e[2] = element3s[i*3+2] - firstvertex;
// same, from the 32-bit index array
4859 e[0] = element3i[i*3+0] - firstvertex;
4860 e[1] = element3i[i*3+1] - firstvertex;
4861 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros for the clipping code below:
//  SKIPBACKFACE      builds the two screen-space edges around vertex 1 and the
//                    scalar 2D cross product of them (lane 0 of trianglenormal),
//                    then tests its sign against zero.
//  CLIPPEDVERTEXLERP stores the interpolation fraction between vertices p1 and
//                    p2 in clipfrac[p1] and projects the interpolated position
//                    into screen[k].
//  CLIPPEDVERTEXCOPY loads an already projected vertex into screen[k].
//  GENATTRIB*        produce the three corner values of an attribute, copying
//                    or interpolating with the stored clipfrac values.
4870 #define SKIPBACKFACE \
4871 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4872 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4873 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4874 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4875 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4879 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4883 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4888 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4889 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4891 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4892 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4894 #define CLIPPEDVERTEXCOPY(k,p1) \
4895 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4897 #define GENATTRIBCOPY(attrib, p1) \
4898 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4899 #define GENATTRIBLERP(attrib, p1, p2) \
4901 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4902 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4904 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4908 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4909 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4910 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4911 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4912 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4913 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4914 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4920 // calculate distance from nearplane
4921 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4922 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4923 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// case analysis on which vertices have a non-negative near-plane distance;
// each branch fills screen[] with copied and/or interpolated vertices
4924 if (clipdist[0] >= 0.0f)
4926 if (clipdist[1] >= 0.0f)
4928 if (clipdist[2] >= 0.0f)
4931 // triangle is entirely in front of nearplane
4932 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4939 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4947 if (clipdist[2] >= 0.0f)
4949 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4956 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4963 else if (clipdist[1] >= 0.0f)
4965 if (clipdist[2] >= 0.0f)
4967 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4974 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4980 else if (clipdist[2] >= 0.0f)
4982 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4987 else continue; // triangle is entirely behind nearplane
4990 // calculate integer y coords for triangle points
// the x,y of up to four points are truncated to int and packed to 16 bits
// (the third point is repeated when there is no fourth), then reduced to a
// min/max pair and clamped to the framebuffer bounds computed above
4991 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4992 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4993 screenmin = _mm_min_epi16(screeni, screenir),
4994 screenmax = _mm_max_epi16(screeni, screenir);
4995 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4996 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4997 screenmin = _mm_max_epi16(screenmin, fbmin);
4998 screenmax = _mm_min_epi16(screenmax, fbmax);
4999 // skip offscreen triangles
5000 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5002 starty = _mm_extract_epi16(screenmin, 1);
5003 endy = _mm_extract_epi16(screenmax, 1)+1;
// triangle lies entirely between the end of band 1 and the start of band 2
5004 if (starty >= maxy1 && endy <= miny2)
// keep the per-point integer y values (upper 16 bits of each 32-bit lane) for the scanline walk
5006 screeny = _mm_srai_epi32(screeni, 16);
5009 triangle = &thread->triangles[thread->numtriangles];
5011 // calculate attribute plans for triangle data...
5012 // okay, this triangle is going to produce spans, we'd better project
5013 // the interpolants now (this is what gives perspective texturing),
5014 // this consists of simply multiplying all arrays by the W coord
5015 // (which is basically 1/Z), which will be undone per-pixel
5016 // (multiplying by Z again) to get the perspective-correct array
5019 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5020 __m128 mipedgescale, mipdensity;
// the screen-space edge components divided by the cross product give the
// factors that turn per-edge attribute deltas into per-pixel x/y slopes
5021 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5022 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5023 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5024 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5025 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane for the w component of the screen coordinates, anchored at vertex 1
5026 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5027 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5028 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5029 attribedge1 = _mm_sub_ss(w0, w1);
5030 attribedge2 = _mm_sub_ss(w2, w1);
5031 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5032 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5033 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5034 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5035 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w layout: [0] = x slope, [1] = y slope, [2] = value at the screen origin
5036 _mm_store_ss(&triangle->w[0], attribxslope);
5037 _mm_store_ss(&triangle->w[1], attribyslope);
5038 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only when any of its first three coefficients is non-zero
5043 if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
5045 float cliporigin, clipxslope, clipyslope;
// plane for the screen-space z component, built the same way as the w plane
5046 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5047 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5048 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5049 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5050 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5051 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// substitute the z plane into the clip plane to get the clip value as a function of screen x,y
5052 cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
5053 clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
5054 clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
// clip0origin + (y+0.5)*clip0slope is evaluated per scanline below as the
// boundary x (clip0); clip0dir records the sign of the x slope
5057 clip0origin = -cliporigin/clipxslope;
5058 clip0slope = -clipyslope/clipxslope;
5059 clip0dir = clipxslope > 0 ? 1 : -1;
5061 else if(clipyslope > 0)
5063 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5064 clip0slope = dpsoftrast.fb_width;
5067 else if(clipyslope < 0)
5069 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5070 clip0slope = -dpsoftrast.fb_width;
5073 else if(clip0origin < 0) continue;
5076 mipedgescale = _mm_setzero_ps();
// build the interpolation plane of every array the current shader mode lists
5077 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5079 __m128 attrib0, attrib1, attrib2;
5080 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5081 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next packed array inside command->arrays
5083 arrays += numvertices*4;
5084 GENATTRIBS(attrib0, attrib1, attrib2);
// corner values are multiplied by w before the plane is derived (see the comment block above)
5085 attriborigin = _mm_mul_ps(attrib1, w1);
5086 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5087 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5088 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5089 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5090 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5091 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5092 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5093 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives mip selection: attribute change along each of the
// two edges scaled by the reciprocal screen-space edge length (rsqrt approximation)
5094 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5096 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5097 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5098 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5099 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit the shader mode lists
5103 memset(triangle->mip, 0, sizeof(triangle->mip));
5104 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5106 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5107 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5109 texture = thread->texbound[texunit];
5110 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two ints read from texture->mipmap[0][2..3] (presumably the level-0
// width and height -- TODO confirm), square and sum per edge, keep the smaller edge
5112 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5113 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5114 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5115 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5116 // this will be multiplied in the texturing routine by the texture resolution
5117 y = _mm_cvtss_si32(mipdensity);
// half the base-2 logarithm of the squared density, capped at the last mip level
5120 y = (int)(log((float)y)*0.5f/M_LN2);
5121 if (y > texture->mipmaps - 1)
5122 y = texture->mipmaps - 1;
5123 triangle->mip[texunit] = y;
// scanline walk: the first pass is limited to band 1 (bandy = min(endy, maxy1));
// the loop increment then switches the limit to band 2 and moves y up to at least miny2
5129 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5132 __m128 xcoords, xslope;
// per-point mask of "y is greater than this point's y"; the byte mask selects
// which pair of edges (previous point -> next point) crosses this scanline
5133 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5134 int yccmask = _mm_movemask_epi8(ycc);
5135 int edge0p, edge0n, edge1p, edge1n;
// four-point (clipped quad) cases
5143 case 0xFFFF: /*0000*/ y = endy; continue;
5144 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5145 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5146 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5147 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5148 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5149 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5150 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5151 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5152 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5153 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5154 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5155 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5156 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5157 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5158 case 0x0000: /*1111*/ y++; continue;
// three-point (triangle) cases; the third point is duplicated in the packed
// y values, so the upper two mask nibbles always agree
5166 case 0xFFFF: /*000*/ y = endy; continue;
5167 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5168 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5169 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5170 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5171 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5172 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5173 case 0x0000: /*111*/ y++; continue;
// points already passed become 0x7FFF, the others keep their y; the minimum
// across lanes is the next point y, which bounds this run of scanlines
5176 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5177 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5178 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5179 nexty = _mm_extract_epi16(ycc, 0);
// never run past the current band
5180 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges at once (edge 0 in the low half, edge 1 in the high half),
// then the x of each edge at scanline y, plus 0.5
5181 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5182 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5183 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5184 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5185 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// order the two edges so the one with the smaller x comes first
5186 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5188 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5189 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5191 clip0 = clip0origin + (y+0.5f)*clip0slope;
// one iteration per scanline, stepping both edge x values and the clip boundary
5192 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5194 int startx, endx, clipx = minx, offset;
5195 startx = _mm_cvtss_si32(xcoords);
5196 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5199 if (startx < 0) startx = 0;
// move startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (assumes that constant is a power of two -- TODO confirm)
5200 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5202 if (endx > maxx) endx = maxx;
5203 if (startx >= endx) continue;
// apply the user clip boundary clip0 to the span's x range
5211 if(endx <= clip0) continue;
5212 clipx = max((int)clip0, minx);
5213 startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5216 else if (endx > clip0)
5218 if(startx >= clip0) continue;
// emit the scanline as chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// startx/endx stored in the span are relative to the chunk offset
5223 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5225 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5226 span->triangle = thread->numtriangles;
5229 span->startx = max(clipx - offset, 0);
5230 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5231 if (span->startx >= span->endx)
// span queue full: process what has been queued so far
5233 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5234 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process the pending spans and restart the triangle queue
5239 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5241 DPSOFTRAST_Draw_ProcessSpans(thread);
5242 thread->numtriangles = 0;
// release this thread's reference; the thread that brings the count to zero
// frees separately allocated vertex data (same rule as the early-out above)
5246 if (!ATOMIC_DECREMENT(command->refcount))
5248 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5249 MM_FREE(command->arrays);
// process whatever is still queued from this draw
5252 if (thread->numspans > 0 || thread->numtriangles > 0)
5254 DPSOFTRAST_Draw_ProcessSpans(thread);
5255 thread->numtriangles = 0;
// Reserves a Draw command together with storage for its vertex data and a
// private copy of the index list.
// Storage layout (numvertices float[4] entries each): screen coordinates, the
// DPSOFTRAST_ARRAY_POSITION output, then one array per entry listed by the
// current shader mode, followed by the copied indices.
// The global dpsoftrast.screencoord4f / post_array4f pointers are pointed at
// this storage.
// When command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data
// is allocated separately with MM_CALLOC and the command keeps its base size;
// otherwise the data is placed directly after the command.
5260 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5264 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two fixed arrays: screen coordinates and post-transform position
5265 int datasize = 2*numvertices*sizeof(float[4]);
5266 DPSOFTRAST_Command_Draw *command;
5267 unsigned char *data;
// one additional array for every array index the shader mode lists
5268 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5270 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5271 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5273 datasize += numvertices*sizeof(float[4]);
// room for the index copy, sized for 16-bit or 32-bit indices
5276 datasize += numtriangles*sizeof(unsigned short[3]);
5278 datasize += numtriangles*sizeof(int[3]);
5279 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for the command pool: keep the command small and allocate the data separately
5280 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5282 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
// NOTE(review): no NULL check on the MM_CALLOC result is apparent before data is used -- confirm
5283 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives in the pool directly behind the command
5287 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5288 data = (unsigned char *)command + commandsize;
5290 command->firstvertex = firstvertex;
5291 command->numvertices = numvertices;
5292 command->numtriangles = numtriangles;
5293 command->arrays = (float *)data;
// carve the data block into the per-array output pointers
5294 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5295 dpsoftrast.firstvertex = firstvertex;
5296 dpsoftrast.numvertices = numvertices;
5297 dpsoftrast.screencoord4f = (float *)data;
5298 data += numvertices*sizeof(float[4]);
5299 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5300 data += numvertices*sizeof(float[4]);
5301 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5303 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5304 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5306 dpsoftrast.post_array4f[j] = (float *)data;
5307 data += numvertices*sizeof(float[4]);
// the index list is copied into the command's own storage
5309 command->element3i = NULL;
5310 command->element3s = NULL;
5313 command->element3s = (unsigned short *)data;
5314 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5318 command->element3i = (int *)data;
5319 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point.  Allocates a Draw command, runs the current shader
// mode's vertex shader, clamps the resulting y range to the framebuffer height
// and either discards the command (empty range) or publishes it.  In threaded
// mode, starving threads whose bands overlap the draw are signalled; otherwise
// DPSOFTRAST_Draw_FlushThreads is called.
5324 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5326 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5327 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5328 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5329 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing to draw: free separately allocated vertex data (command has base
// size in that case) and take the command back out of the pool
5330 if (command->starty >= command->endy)
5332 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5333 MM_FREE(command->arrays);
5334 DPSOFTRAST_UndoCommand(command->commandsize);
5337 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it in DPSOFTRAST_Interpret_Draw
5338 command->refcount = dpsoftrast.numthreads;
5340 if (dpsoftrast.usethreads)
5343 DPSOFTRAST_Draw_SyncCommands();
5344 for (i = 0; i < dpsoftrast.numthreads; i++)
5346 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// wake only starving threads whose first or second band intersects the draw's y range
5347 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5348 Thread_CondSignal(thread->drawcond);
5353 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command record: carries the new framebuffer width and height.
5357 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: flags the thread's framebuffer-dependent state for revalidation.
5358 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5360 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: records the framebuffer dimensions, the depth buffer and
// up to four color buffers in the global state, recomputes the viewport
// center/scale and queues a SetRenderTargets command carrying the new size.
5362 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5364 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any of the requested targets differs from the current global state
5365 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5366 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5367 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5369 dpsoftrast.fb_width = width;
5370 dpsoftrast.fb_height = height;
5371 dpsoftrast.fb_depthpixels = depthpixels;
5372 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5373 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5374 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5375 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// viewport center/scale are recomputed after the framebuffer size is updated
5376 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5377 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5378 command->width = width;
5379 command->height = height;
// Executes the commands in the shared command pool on behalf of one thread,
// starting at thread->commandoffset and stopping when the offset reaches
// endoffset.  The thread's commandoffset is written back when done.
5382 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5384 int commandoffset = thread->commandoffset;
5385 while (commandoffset != endoffset)
5387 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5388 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case for a fixed-size command: call its
// DPSOFTRAST_Interpret_<name> handler, advance by the aligned struct size and
// wrap the offset to 0 at DPSOFTRAST_DRAW_MAXCOMMANDPOOL
5390 #define INTERPCOMMAND(name) \
5391 case DPSOFTRAST_OPCODE_##name : \
5392 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5393 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5394 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5395 commandoffset = 0; \
5397 INTERPCOMMAND(Viewport)
5398 INTERPCOMMAND(ClearColor)
5399 INTERPCOMMAND(ClearDepth)
5400 INTERPCOMMAND(ColorMask)
5401 INTERPCOMMAND(DepthTest)
5402 INTERPCOMMAND(ScissorTest)
5403 INTERPCOMMAND(Scissor)
5404 INTERPCOMMAND(BlendFunc)
5405 INTERPCOMMAND(BlendSubtract)
5406 INTERPCOMMAND(DepthMask)
5407 INTERPCOMMAND(DepthFunc)
5408 INTERPCOMMAND(DepthRange)
5409 INTERPCOMMAND(PolygonOffset)
5410 INTERPCOMMAND(CullFace)
5411 INTERPCOMMAND(AlphaTest)
5412 INTERPCOMMAND(AlphaFunc)
5413 INTERPCOMMAND(SetTexture)
5414 INTERPCOMMAND(SetShader)
5415 INTERPCOMMAND(Uniform4f)
5416 INTERPCOMMAND(UniformMatrix4f)
5417 INTERPCOMMAND(Uniform1i)
5418 INTERPCOMMAND(SetRenderTargets)
5419 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized, so the offset advances by the size stored
// in the command itself; the thread's offset is also published right after a draw
5421 case DPSOFTRAST_OPCODE_Draw:
5422 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5423 commandoffset += command->commandsize;
5424 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5426 thread->commandoffset = commandoffset;
5429 case DPSOFTRAST_OPCODE_Reset:
5434 thread->commandoffset = commandoffset;
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index >= 0: interprets commands up to
// dpsoftrast.drawcommand, and once caught up sleeps on drawcond until signalled.
5437 static int DPSOFTRAST_Draw_Thread(void *data)
5439 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5440 while(thread->index >= 0)
// there are commands this thread has not interpreted yet
5442 if (thread->commandoffset != dpsoftrast.drawcommand)
5444 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5448 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before going to sleep
5449 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// signal waitcond when the waiting flag is set (DPSOFTRAST_Draw_FlushThreads
// sets it before waiting), then mark this thread as starving while it waits for work
5451 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5452 thread->starving = true;
5453 Thread_CondWait(thread->drawcond, thread->drawmutex);
5454 thread->starving = false;
5456 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread state up to dpsoftrast.drawcommand and then resets the
// command pool's usedcommands counter to 0.  In threaded mode this wakes the
// workers and waits for each of them; the final loop interprets the commands
// directly on the calling thread (presumably the non-threaded path -- TODO confirm).
5462 static void DPSOFTRAST_Draw_FlushThreads(void)
5464 DPSOFTRAST_State_Thread *thread;
5466 DPSOFTRAST_Draw_SyncCommands();
5467 if (dpsoftrast.usethreads)
// pass 1: signal every thread that is behind and currently starving
5469 for (i = 0; i < dpsoftrast.numthreads; i++)
5471 thread = &dpsoftrast.threads[i];
5472 if (thread->commandoffset != dpsoftrast.drawcommand)
5474 Thread_LockMutex(thread->drawmutex);
5475 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5476 Thread_CondSignal(thread->drawcond);
5477 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread still behind, set waiting and block on its waitcond
// (signalled by the worker in DPSOFTRAST_Draw_Thread)
5480 for (i = 0; i < dpsoftrast.numthreads; i++)
5482 thread = &dpsoftrast.threads[i];
5483 if (thread->commandoffset != dpsoftrast.drawcommand)
5485 Thread_LockMutex(thread->drawmutex);
5486 if (thread->commandoffset != dpsoftrast.drawcommand)
5488 thread->waiting = true;
5489 Thread_CondWait(thread->waitcond, thread->drawmutex);
5490 thread->waiting = false;
5492 Thread_UnlockMutex(thread->drawmutex);
// run the pending commands for each thread state on the calling thread
5498 for (i = 0; i < dpsoftrast.numthreads; i++)
5500 thread = &dpsoftrast.threads[i];
5501 if (thread->commandoffset != dpsoftrast.drawcommand)
5502 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5505 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point that forwards to DPSOFTRAST_Draw_FlushThreads.
5508 void DPSOFTRAST_Flush(void)
5510 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably completes all outstanding work in the same way as
// DPSOFTRAST_Flush -- confirm against the function body.
5513 void DPSOFTRAST_Finish(void)
5518 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5528 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5529 dpsoftrast.bigendian = u.b[3];
5530 dpsoftrast.fb_width = width;
5531 dpsoftrast.fb_height = height;
5532 dpsoftrast.fb_depthpixels = depthpixels;
5533 dpsoftrast.fb_colorpixels[0] = colorpixels;
5534 dpsoftrast.fb_colorpixels[1] = NULL;
5535 dpsoftrast.fb_colorpixels[1] = NULL;
5536 dpsoftrast.fb_colorpixels[1] = NULL;
5537 dpsoftrast.viewport[0] = 0;
5538 dpsoftrast.viewport[1] = 0;
5539 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5540 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5541 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5542 dpsoftrast.texture_firstfree = 1;
5543 dpsoftrast.texture_end = 1;
5544 dpsoftrast.texture_max = 0;
5545 dpsoftrast.color[0] = 1;
5546 dpsoftrast.color[1] = 1;
5547 dpsoftrast.color[2] = 1;
5548 dpsoftrast.color[3] = 1;
5549 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5550 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5551 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5552 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5553 for (i = 0; i < dpsoftrast.numthreads; i++)
5555 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5557 thread->cullface = GL_BACK;
5558 thread->colormask[1] = 1;
5559 thread->colormask[2] = 1;
5560 thread->colormask[3] = 1;
5561 thread->blendfunc[0] = GL_ONE;
5562 thread->blendfunc[1] = GL_ZERO;
5563 thread->depthmask = true;
5564 thread->depthtest = true;
5565 thread->depthfunc = GL_LEQUAL;
5566 thread->scissortest = false;
5567 thread->alphatest = false;
5568 thread->alphafunc = GL_GREATER;
5569 thread->alphavalue = 0.5f;
5570 thread->viewport[0] = 0;
5571 thread->viewport[1] = 0;
5572 thread->viewport[2] = dpsoftrast.fb_width;
5573 thread->viewport[3] = dpsoftrast.fb_height;
5574 thread->scissor[0] = 0;
5575 thread->scissor[1] = 0;
5576 thread->scissor[2] = dpsoftrast.fb_width;
5577 thread->scissor[3] = dpsoftrast.fb_height;
5578 thread->depthrange[0] = 0;
5579 thread->depthrange[1] = 1;
5580 thread->polygonoffset[0] = 0;
5581 thread->polygonoffset[1] = 0;
5582 thread->clipplane[0] = 0;
5583 thread->clipplane[1] = 0;
5584 thread->clipplane[2] = 0;
5585 thread->clipplane[3] = 1;
5587 DPSOFTRAST_RecalcThread(thread);
5589 thread->numspans = 0;
5590 thread->numtriangles = 0;
5591 thread->commandoffset = 0;
5592 thread->waiting = false;
5593 thread->starving = false;
5595 thread->validate = -1;
5596 DPSOFTRAST_Validate(thread, -1);
5598 if (dpsoftrast.usethreads)
5600 thread->waitcond = Thread_CreateCond();
5601 thread->drawcond = Thread_CreateCond();
5602 thread->drawmutex = Thread_CreateMutex();
5603 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5609 void DPSOFTRAST_Shutdown(void)
5612 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5614 DPSOFTRAST_State_Thread *thread;
5615 for (i = 0; i < dpsoftrast.numthreads; i++)
5617 thread = &dpsoftrast.threads[i];
5618 Thread_LockMutex(thread->drawmutex);
5620 Thread_CondSignal(thread->drawcond);
5621 Thread_UnlockMutex(thread->drawmutex);
5622 Thread_WaitThread(thread->thread, 0);
5623 Thread_DestroyCond(thread->waitcond);
5624 Thread_DestroyCond(thread->drawcond);
5625 Thread_DestroyMutex(thread->drawmutex);
5628 for (i = 0;i < dpsoftrast.texture_end;i++)
5629 if (dpsoftrast.texture[i].bytes)
5630 MM_FREE(dpsoftrast.texture[i].bytes);
5631 if (dpsoftrast.texture)
5632 free(dpsoftrast.texture);
5633 if (dpsoftrast.threads)
5634 MM_FREE(dpsoftrast.threads);
5635 memset(&dpsoftrast, 0, sizeof(dpsoftrast));