3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment, in bytes, passed to _mm_malloc by MM_MALLOC and MM_CALLOC.
18 #define ATOMIC_SIZE 32
// Per-toolchain helpers used throughout this file:
//   ALIGN(var)        declare var with 16-byte alignment
//   ATOMIC(var)       declare var with 32-byte alignment
//   MEMORY_BARRIER    store fence (_mm_sfence)
//   ATOMIC_COUNTER    integer type for counters updated with the macros below
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD   counter updates
// Apple toolchains: OSAtomic barrier variants.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec alignment and Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: plain, non-atomic definitions for anything left undefined above.
// NOTE(review): some of the guarding #ifndef/#endif lines are not visible in
// this excerpt - confirm each fallback is guarded before relying on it.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
85 #define MM_FREE _mm_free
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays.  DPSOFTRAST_ARRAY_TOTAL is the
// array count and is used to size tables such as the triangle attribs[] and
// the state's post_array4f[].
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One slot of dpsoftrast.texture[].  The lookup and allocation code in this
// file treats a slot whose bytes pointer is NULL as unused.
108 typedef struct DPSOFTRAST_Texture_s
115 DPSOFTRAST_TEXTURE_FILTER filter; // written by DPSOFTRAST_Texture_Filter
118 ATOMIC_COUNTER binds; // NOTE(review): presumably counts outstanding binds by draw threads - confirm
119 unsigned char *bytes; // pixel storage for every mip level, allocated with MM_CALLOC
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
// COMMAND_SIZE is the granularity DPSOFTRAST_ALIGNCOMMAND rounds command
// sizes up to; COMMAND_ALIGN applies ALIGN() to a command struct declaration.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every record stored in the command pool.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
130 unsigned short commandsize; // size value passed to DPSOFTRAST_AllocateCommand for this record
// Opcode written by DPSOFTRAST_AllocateCommand when a command does not fit in
// the remaining tail of the pool.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> with
// the given value and a DPSOFTRAST_Command_<name> struct that starts with the
// same opcode/commandsize header.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command - confirm against its users.
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands.  The freecommand/usedcommands members used by
// the allocator are declared on lines not visible in this excerpt.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data read by the DPSOFTRAST_CALCATTRIB macros.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] is multiplied by the span x, attribs[array][1] by the
// span y, and attribs[array][2] is the constant term.
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// SSE evaluation of attribute 'arrayindex' for a span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// Uses aligned loads, so the attribs rows must be 16-byte aligned.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar evaluation of attribute 'arrayindex' for a span, writing four floats:
//   slope[k] = attribs[arrayindex][0][k]
//   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// Every parameter is parenthesized so expression arguments (e.g. &span)
// expand correctly.  Arguments are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels produced from a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of thread->validate: each one marks a group of derived values that
// DPSOFTRAST_Validate must recompute (framebuffer/scissor, depth function,
// blend mode).  DPSOFTRAST_VALIDATE_DRAW is the union of all three.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes selected by DPSOFTRAST_RecalcBlendFunc from the (sfactor,
// dfactor) pair; the pairs noted below are the ones that function maps.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0, // GL_ONE, GL_ZERO (also the fallback)
	DPSOFTRAST_BLENDMODE_ALPHA       = 1, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2, // GL_SRC_ALPHA, GL_ONE
	DPSOFTRAST_BLENDMODE_ADD         = 3, // GL_ONE, GL_ONE
	DPSOFTRAST_BLENDMODE_INVMOD      = 4, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
	DPSOFTRAST_BLENDMODE_MUL         = 5, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
	DPSOFTRAST_BLENDMODE_MUL2        = 6, // GL_DST_COLOR, GL_SRC_COLOR
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7, // GL_SRC_ALPHA, GL_ONE with blend subtract enabled
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_INVADD      = 9, // GL_ONE_MINUS_DST_COLOR, GL_ONE
	DPSOFTRAST_BLENDMODE_TOTAL       = 10 // number of modes
}
DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state, updated by the DPSOFTRAST_Interpret_*
// handlers.  Many members are declared on lines not visible in this excerpt.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
237 float polygonoffset[2]; // [0] alongnormal, [1] intoview (see DPSOFTRAST_Interpret_PolygonOffset)
238 ALIGN(float clipplane[4]);
241 int shader_permutation;
242 int shader_exactspecularmath;
244 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // re-pointed by DPSOFTRAST_Texture_Grow when the texture array moves
246 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
247 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
249 // DPSOFTRAST_VALIDATE_ flags
252 // derived values (DPSOFTRAST_VALIDATE_FB)
255 ALIGN(float fb_viewportcenter[4]);
256 ALIGN(float fb_viewportscale[4]);
258 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
261 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
270 ATOMIC(volatile int commandoffset); // compared against dpsoftrast.drawcommand in DPSOFTRAST_Draw_FreeCommandPool
// Flags used with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool.
272 volatile bool waiting;
273 volatile bool starving;
280 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
281 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
283 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is 'dpsoftrast' below.
// Many members are declared on lines not visible in this excerpt.
285 typedef ATOMIC(struct DPSOFTRAST_State_s
289 unsigned int *fb_depthpixels; // depth buffer, indexed as y * fb_width + x by the clear code
290 unsigned int *fb_colorpixels[4]; // up to four colour targets; NULL entries are skipped by the clear code
293 ALIGN(float fb_viewportcenter[4]);
294 ALIGN(float fb_viewportscale[4]);
297 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
298 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
300 const float *pointer_vertex3f;
301 const float *pointer_color4f;
302 const unsigned char *pointer_color4ub;
303 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
307 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
308 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
312 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
313 float *screencoord4f;
319 int shader_permutation;
320 int shader_exactspecularmath;
324 int texture_firstfree; // index where DPSOFTRAST_Texture_New starts looking for a free slot
325 DPSOFTRAST_Texture *texture; // texture slot array, grown by DPSOFTRAST_Texture_Grow
330 const char *errorstring; // set to a static message when an API call rejects its arguments
335 DPSOFTRAST_State_Thread *threads; // numthreads entries
337 ATOMIC(volatile int drawcommand); // set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
339 DPSOFTRAST_State_Command_Pool commandpool;
343 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating-point depth to the integer depth buffer.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into a BGRA8 word: a<<24 | r<<16 | g<<8 | b,
// each channel rounded via *255 + 0.5.  Arguments are parenthesized so
// expressions can be passed, and the shifts are done on unsigned values so
// that a full alpha channel does not overflow a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth-buffer value DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
351 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
353 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
354 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
355 fb_viewportcenter[3] = 0.5f;
356 fb_viewportcenter[0] = 0.0f;
357 fb_viewportscale[1] = 0.5f * viewport[2];
358 fb_viewportscale[2] = -0.5f * viewport[3];
359 fb_viewportscale[3] = 0.5f;
360 fb_viewportscale[0] = 1.0f;
363 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
365 if (dpsoftrast.interlace)
367 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
370 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
374 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
375 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
379 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
381 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
382 // and viewport projection values
385 x1 = thread->scissor[0];
386 x2 = thread->scissor[0] + thread->scissor[2];
387 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
388 y2 = dpsoftrast.fb_height - thread->scissor[1];
389 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
391 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
393 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
394 thread->fb_scissor[0] = x1;
395 thread->fb_scissor[1] = y1;
396 thread->fb_scissor[2] = x2 - x1;
397 thread->fb_scissor[3] = y2 - y1;
399 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
400 DPSOFTRAST_RecalcThread(thread);
403 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
405 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
408 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
410 if (thread->blendsubtract)
412 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
414 #define BLENDFUNC(sfactor, dfactor, blendmode) \
415 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
416 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
417 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
422 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
424 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
426 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
427 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
428 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
429 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
431 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
432 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
433 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; always evaluates to 0.  The thread
// argument is parenthesized so expressions such as &state can be passed; it
// is evaluated twice when validation runs.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
441 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
443 mask &= thread->validate;
446 if (mask & DPSOFTRAST_VALIDATE_FB)
448 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
449 DPSOFTRAST_RecalcFB(thread);
451 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
453 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
454 DPSOFTRAST_RecalcDepthFunc(thread);
456 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
458 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
459 DPSOFTRAST_RecalcBlendFunc(thread);
463 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
465 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
466 return &dpsoftrast.texture[index];
470 static void DPSOFTRAST_Texture_Grow(void)
472 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
473 DPSOFTRAST_State_Thread *thread;
477 // expand texture array as needed
478 if (dpsoftrast.texture_max < 1024)
479 dpsoftrast.texture_max = 1024;
481 dpsoftrast.texture_max *= 2;
482 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
483 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
484 if (dpsoftrast.texbound[i])
485 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
486 for (j = 0; j < dpsoftrast.numthreads; j++)
488 thread = &dpsoftrast.threads[j];
489 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
490 if (thread->texbound[i])
491 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
495 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
504 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
505 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
506 DPSOFTRAST_Texture *texture;
507 if (width*height*depth < 1)
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
512 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
514 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
519 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
521 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
523 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
524 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
534 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
536 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
541 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
546 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
551 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
556 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
561 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
563 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
566 // find first empty slot in texture array
567 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
568 if (!dpsoftrast.texture[texnum].bytes)
570 dpsoftrast.texture_firstfree = texnum + 1;
571 if (dpsoftrast.texture_max <= texnum)
572 DPSOFTRAST_Texture_Grow();
573 if (dpsoftrast.texture_end <= texnum)
574 dpsoftrast.texture_end = texnum + 1;
575 texture = &dpsoftrast.texture[texnum];
576 memset(texture, 0, sizeof(*texture));
577 texture->flags = flags;
578 texture->width = width;
579 texture->height = height;
580 texture->depth = depth;
581 texture->sides = sides;
593 s = w * h * d * sides * 4;
594 texture->mipmap[mipmaps][0] = size;
595 texture->mipmap[mipmaps][1] = s;
596 texture->mipmap[mipmaps][2] = w;
597 texture->mipmap[mipmaps][3] = h;
598 texture->mipmap[mipmaps][4] = d;
601 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
607 texture->mipmaps = mipmaps;
608 texture->size = size;
610 // allocate the pixels now
611 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture 'index' and marks its slot unused.
// Does nothing when the index does not name a live texture.
// NOTE(review): three lines between the lookup and MM_FREE are not visible
// in this excerpt - confirm what they do before changing this function.
615 void DPSOFTRAST_Texture_Free(int index)
617 DPSOFTRAST_Texture *texture;
618 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
622 MM_FREE(texture->bytes);
623 texture->bytes = NULL;
// clearing the whole slot also leaves bytes NULL, which is what marks it free
624 memset(texture, 0, sizeof(*texture));
625 // adjust the free range and used range
626 if (dpsoftrast.texture_firstfree > index)
627 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing unused slots
628 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
629 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture 'index' by box-filtering
// the level above.  Pixels are treated as four bytes each.
// NOTE(review): the lines computing layer0/layer1/row0/row1 and the branch
// choosing between the 3D and 2D paths are not visible in this excerpt.
631 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
633 int i, x, y, z, w, layer0, layer1, row0, row1;
634 unsigned char *o, *i0, *i1, *i2, *i3;
635 DPSOFTRAST_Texture *texture;
636 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
637 if (texture->mipmaps <= 1)
639 for (i = 1;i < texture->mipmaps;i++)
641 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
645 if (layer1 >= texture->mipmap[i-1][4])
646 layer1 = texture->mipmap[i-1][4]-1;
647 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
651 if (row1 >= texture->mipmap[i-1][3])
652 row1 = texture->mipmap[i-1][3]-1;
// o: destination row in level i; i0..i3: the source rows in level i-1 for
// (layer0,row0), (layer0,row1), (layer1,row0) and (layer1,row1)
653 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
654 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
655 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
656 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
657 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
658 w = texture->mipmap[i][2];
661 if (texture->mipmap[i-1][2] > 1)
663 // average 3D texture
// eight source texels per output texel, rounded
664 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
666 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
667 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
668 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
669 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
674 // average 3D mipmap with parent width == 1
// NOTE(review): i2/i3 are not advanced here; that only matters if w can
// exceed 1 on this path - confirm.
675 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
677 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
678 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
679 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
680 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
686 if (texture->mipmap[i-1][2] > 1)
688 // average 2D texture (common case)
// four source texels per output texel, rounded
689 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
691 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
692 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
693 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
694 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
699 // 2D texture with parent width == 1
700 o[0] = (i0[0] + i1[0] + 1) >> 1;
701 o[1] = (i0[1] + i1[1] + 1) >> 1;
702 o[2] = (i0[2] + i1[2] + 1) >> 1;
703 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from 'pixels'
// (tightly packed rows) into mip level 0 of texture 'index' at
// (blockx, blocky), then rebuilds the mipmaps.
// NOTE(review): several lines after the lookup are not visible in this
// excerpt, and no bounds check on the rectangle is visible here - confirm
// that callers only pass rectangles inside the texture.
710 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
712 DPSOFTRAST_Texture *texture;
714 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel; mipmap[0][2] is the level-0 width
719 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
720 while (blockheight > 0)
722 memcpy(dst, pixels, blockwidth * 4);
723 pixels += blockwidth * 4;
724 dst += texture->mipmap[0][2] * 4;
728 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces all of mip level 0 of texture 'index' with 'pixels' (which must
// supply mipmap[0][1] bytes) and rebuilds the mipmaps.
// NOTE(review): three lines between the lookup and the copy are not visible
// in this excerpt.
730 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
732 DPSOFTRAST_Texture *texture;
733 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
737 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
738 DPSOFTRAST_Texture_CalculateMipmaps(index);
740 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
742 DPSOFTRAST_Texture *texture;
743 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
744 return texture->mipmap[mip][2];
746 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
748 DPSOFTRAST_Texture *texture;
749 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
750 return texture->mipmap[mip][3];
752 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
754 DPSOFTRAST_Texture *texture;
755 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
756 return texture->mipmap[mip][4];
// Returns a pointer to the first pixel of mip level 'mip' of texture
// 'index', or 0 when the texture does not exist.
// NOTE(review): 'mip' is used to index mipmap[] without a range check in the
// visible lines, and two lines before the return are not visible here.
758 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
760 DPSOFTRAST_Texture *texture;
761 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
764 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture 'index'.  Filters above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected, with errorstring set, for
// textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// NOTE(review): lines between the check and the assignment are not visible
// in this excerpt.
766 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
768 DPSOFTRAST_Texture *texture;
769 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
770 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
772 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
777 texture->filter = filter;
780 static void DPSOFTRAST_Draw_FlushThreads(void);
782 static void DPSOFTRAST_Draw_SyncCommands(void)
784 if(dpsoftrast.usethreads) MEMORY_BARRIER;
785 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least 'space' bytes of the command pool available by
// recomputing how much of the pool the threads still have to consume and,
// if that is not enough, waiting on a thread.
// NOTE(review): the loop structure, the selection of 'waitindex' and the
// early exits are on lines not visible in this excerpt - read the full
// function before changing anything here.
788 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
790 DPSOFTRAST_State_Thread *thread;
792 int freecommand = dpsoftrast.commandpool.freecommand;
793 int usedcommands = dpsoftrast.commandpool.usedcommands;
794 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
796 DPSOFTRAST_Draw_SyncCommands();
802 for (i = 0; i < dpsoftrast.numthreads; i++)
804 thread = &dpsoftrast.threads[i];
// distance from this thread's read position to the write position, modulo the pool size
805 commandoffset = freecommand - thread->commandoffset;
806 if (commandoffset < 0)
807 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
808 if (commandoffset > usedcommands)
811 usedcommands = commandoffset;
814 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
816 thread = &dpsoftrast.threads[waitindex];
817 Thread_LockMutex(thread->drawmutex);
// only wait if the thread has not yet reached the published draw position
818 if (thread->commandoffset != dpsoftrast.drawcommand)
820 thread->waiting = true;
821 if (thread->starving) Thread_CondSignal(thread->drawcond);
822 Thread_CondWait(thread->waitcond, thread->drawmutex);
823 thread->waiting = false;
825 Thread_UnlockMutex(thread->drawmutex);
827 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds 'size' up to the next multiple of COMMAND_SIZE (which must be a
// power of two for the masking to work).
830 #define DPSOFTRAST_ALIGNCOMMAND(size) \
831 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a DPSOFTRAST_Command_<name> record from the command pool with
// opcode DPSOFTRAST_OPCODE_<name> and returns it cast to that struct type.
832 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
833 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves 'size' bytes in the command pool for a command with the given
// opcode and fills in its header.  When the command does not fit in the tail
// of the pool, a DPSOFTRAST_OPCODE_Reset record is written there and the
// tail is counted as used.
// NOTE(review): the lines that reset/advance freecommand and return the
// command are not visible in this excerpt.
835 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
837 DPSOFTRAST_Command *command;
838 int freecommand = dpsoftrast.commandpool.freecommand;
839 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space needed beyond 'size': one header, plus the unusable tail when wrapping
840 int extra = sizeof(DPSOFTRAST_Command);
841 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
842 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough room: reclaim consumed commands (threaded) or flush
843 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
845 if (dpsoftrast.usethreads)
846 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
848 DPSOFTRAST_Draw_FlushThreads();
849 freecommand = dpsoftrast.commandpool.freecommand;
850 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit before the end of the pool: mark the tail with a Reset record
852 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
854 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855 command->opcode = DPSOFTRAST_OPCODE_Reset;
856 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
859 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
860 command->opcode = opcode;
861 command->commandsize = size;
863 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
865 dpsoftrast.commandpool.freecommand = freecommand;
866 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back 'size' bytes of the most recently allocated command by moving
// the pool's write position and used count back.
// NOTE(review): the lines that subtract 'size' from freecommand and test for
// wrap-around are not visible in this excerpt.
870 static void DPSOFTRAST_UndoCommand(int size)
872 int freecommand = dpsoftrast.commandpool.freecommand;
873 int usedcommands = dpsoftrast.commandpool.usedcommands;
876 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
877 usedcommands -= size;
878 dpsoftrast.commandpool.freecommand = freecommand;
879 dpsoftrast.commandpool.usedcommands = usedcommands;
882 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
883 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
885 thread->viewport[0] = command->x;
886 thread->viewport[1] = command->y;
887 thread->viewport[2] = command->width;
888 thread->viewport[3] = command->height;
889 thread->validate |= DPSOFTRAST_VALIDATE_FB;
891 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
893 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
896 command->width = width;
897 command->height = height;
899 dpsoftrast.viewport[0] = x;
900 dpsoftrast.viewport[1] = y;
901 dpsoftrast.viewport[2] = width;
902 dpsoftrast.viewport[3] = height;
903 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
906 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Applies a queued ClearColor command: fills the part of the scissor
// rectangle that falls inside this thread's row bands with the packed BGRA8
// colour, in every colour target that is present.
// NOTE(review): the local declarations, the width/height computation and the
// inner per-row loop are on lines not visible in this excerpt.
907 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
909 int i, x1, y1, x2, y2, w, h, x, y;
910 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
914 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
915 miny1 = thread->miny1;
916 maxy1 = thread->maxy1;
917 miny2 = thread->miny2;
918 maxy2 = thread->maxy2;
919 x1 = thread->fb_scissor[0];
920 y1 = thread->fb_scissor[1];
921 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
922 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to those owned by this thread
923 if (y1 < miny1) y1 = miny1;
924 if (y2 > maxy2) y2 = maxy2;
929 // FIXME: honor fb_colormask?
930 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
931 for (i = 0;i < 4;i++)
933 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1, the second resumes at miny2 and runs to maxy2
935 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
938 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
939 for (x = x1;x < x2;x++)
944 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
946 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
953 DEFCOMMAND(3, ClearDepth, float depth;)
// Applies a queued ClearDepth command: fills the part of the scissor
// rectangle that falls inside this thread's row bands with the converted
// integer depth value.
// NOTE(review): the local declarations, the width/height computation and the
// inner per-row loop are on lines not visible in this excerpt.
954 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
956 int x1, y1, x2, y2, w, h, x, y;
957 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
961 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
962 miny1 = thread->miny1;
963 maxy1 = thread->maxy1;
964 miny2 = thread->miny2;
965 maxy2 = thread->maxy2;
966 x1 = thread->fb_scissor[0];
967 y1 = thread->fb_scissor[1];
968 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
969 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to those owned by this thread
970 if (y1 < miny1) y1 = miny1;
971 if (y2 > maxy2) y2 = maxy2;
976 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// first pass covers rows up to maxy1, the second resumes at miny2 and runs to maxy2
977 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
980 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
981 for (x = x1;x < x2;x++)
985 void DPSOFTRAST_ClearDepth(float d)
987 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
991 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
992 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
994 thread->colormask[0] = command->r != 0;
995 thread->colormask[1] = command->g != 0;
996 thread->colormask[2] = command->b != 0;
997 thread->colormask[3] = command->a != 0;
998 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1000 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1002 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1009 DEFCOMMAND(5, DepthTest, int enable;)
1010 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1012 thread->depthtest = command->enable;
1013 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1015 void DPSOFTRAST_DepthTest(int enable)
1017 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1018 command->enable = enable;
1021 DEFCOMMAND(6, ScissorTest, int enable;)
1022 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1024 thread->scissortest = command->enable;
1025 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1027 void DPSOFTRAST_ScissorTest(int enable)
1029 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1030 command->enable = enable;
1033 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1034 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1036 thread->scissor[0] = command->x;
1037 thread->scissor[1] = command->y;
1038 thread->scissor[2] = command->width;
1039 thread->scissor[3] = command->height;
1040 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1042 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1044 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1047 command->width = width;
1048 command->height = height;
1051 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1052 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1054 thread->blendfunc[0] = command->sfactor;
1055 thread->blendfunc[1] = command->dfactor;
1056 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1058 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1060 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1061 command->sfactor = sfactor;
1062 command->dfactor = dfactor;
1065 DEFCOMMAND(9, BlendSubtract, int enable;)
1066 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1068 thread->blendsubtract = command->enable;
1069 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1071 void DPSOFTRAST_BlendSubtract(int enable)
1073 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1074 command->enable = enable;
1077 DEFCOMMAND(10, DepthMask, int enable;)
1078 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1080 thread->depthmask = command->enable;
1082 void DPSOFTRAST_DepthMask(int enable)
1084 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1085 command->enable = enable;
1088 DEFCOMMAND(11, DepthFunc, int func;)
1089 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1091 thread->depthfunc = command->func;
1093 void DPSOFTRAST_DepthFunc(int func)
1095 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1096 command->func = func;
1099 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1100 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1102 thread->depthrange[0] = command->nearval;
1103 thread->depthrange[1] = command->farval;
1105 void DPSOFTRAST_DepthRange(float nearval, float farval)
1107 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1108 command->nearval = nearval;
1109 command->farval = farval;
1112 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1113 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1115 thread->polygonoffset[0] = command->alongnormal;
1116 thread->polygonoffset[1] = command->intoview;
1118 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1120 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1121 command->alongnormal = alongnormal;
1122 command->intoview = intoview;
1125 DEFCOMMAND(14, CullFace, int mode;)
1126 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1128 thread->cullface = command->mode;
1130 void DPSOFTRAST_CullFace(int mode)
1132 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1133 command->mode = mode;
1136 DEFCOMMAND(15, AlphaTest, int enable;)
1137 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1139 thread->alphatest = command->enable;
1141 void DPSOFTRAST_AlphaTest(int enable)
1143 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1144 command->enable = enable;
1147 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1148 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1150 thread->alphafunc = command->func;
1151 thread->alphavalue = command->ref;
// Queues an AlphaFunc command: obtains a command record through
// DPSOFTRAST_ALLOCATECOMMAND and stores the comparison function code.
// NOTE(review): the statement at listing line 1157 is not visible here -
// presumably it stores ref into the command; confirm against the file.
1153 void DPSOFTRAST_AlphaFunc(int func, float ref)
1155 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1156 command->func = func;
1160 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1162 dpsoftrast.color[0] = r;
1163 dpsoftrast.color[1] = g;
1164 dpsoftrast.color[2] = b;
1165 dpsoftrast.color[3] = a;
// Reads a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0])
// into outpixels. Output rows are blockwidth*4 bytes apart; framebuffer rows
// are dpsoftrast.fb_width*4 bytes apart.
// NOTE(review): several declarations and the per-pixel/per-row copy statements
// are not visible in this listing.
1168 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1170 int outstride = blockwidth * 4;
1171 int instride = dpsoftrast.fb_width * 4;
1174 int bx2 = blockx + blockwidth;
1175 int by2 = blocky + blockheight;
1179 unsigned char *inpixels;
// clamp the requested block to the framebuffer bounds
1183 if (bx1 < 0) bx1 = 0;
1184 if (by1 < 0) by1 = 0;
1185 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1186 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1188 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts: rows are walked pixel by pixel
1189 if (dpsoftrast.bigendian)
1191 for (y = by1;y < by2;y++)
// source row index is fb_height-1-y, so the framebuffer is read vertically flipped
1193 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// output row index is relative to the clamped top edge by1
1194 o = (unsigned char *)outpixels + (y - by1) * outstride;
1195 for (x = bx1;x < bx2;x++)
// other path: same row addressing as above
1208 for (y = by1;y < by2;y++)
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer
// (dpsoftrast.fb_colorpixels[0]) at (sx, sy) into mip level `mip` of texture
// `index` at (tx, ty). Returns without copying when the texture handle or mip
// level is invalid. Both rectangles are clamped to their surfaces and the
// copy size is the smaller of the two clamped sizes.
// NOTE(review): several declarations and the computation of tw/th/sw/sh are
// not visible in this listing.
1217 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1221 int tx2 = tx + width;
1222 int ty2 = ty + height;
1225 int sx2 = sx + width;
1226 int sy2 = sy + height;
1236 unsigned int *spixels;
1237 unsigned int *tpixels;
1238 DPSOFTRAST_Texture *texture;
1239 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1240 if (mip < 0 || mip >= texture->mipmaps) return;
1242 spixels = dpsoftrast.fb_colorpixels[0];
1243 swidth = dpsoftrast.fb_width;
1244 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, [2] and [3] as the level's width and height
1245 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1246 twidth = texture->mipmap[mip][2];
1247 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level
1248 if (tx1 < 0) tx1 = 0;
1249 if (ty1 < 0) ty1 = 0;
1250 if (tx2 > twidth) tx2 = twidth;
1251 if (ty2 > theight) ty2 = theight;
// clamp the source rectangle to the framebuffer
1252 if (sx1 < 0) sx1 = 0;
1253 if (sy1 < 0) sy1 = 0;
1254 if (sx2 > swidth) sx2 = swidth;
1255 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source can supply
1260 if (tw > sw) tw = sw;
1261 if (th > sh) th = sh;
1262 if (tw < 1 || th < 1)
// source rows are walked downward from sheight-1-sy1 while texture rows go upward (vertical flip)
1264 sy1 = sheight - 1 - sy1;
1265 for (y = 0;y < th;y++)
1266 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// the smaller mip levels are regenerated after the copy
1267 if (texture->mipmaps > 1)
1268 DPSOFTRAST_Texture_CalculateMipmaps(index);
1271 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1272 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1274 if (thread->texbound[command->unitnum])
1275 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1276 thread->texbound[command->unitnum] = command->texture;
1278 void DPSOFTRAST_SetTexture(int unitnum, int index)
1280 DPSOFTRAST_Command_SetTexture *command;
1281 DPSOFTRAST_Texture *texture;
1282 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1284 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1287 texture = DPSOFTRAST_Texture_GetByIndex(index);
1288 if (index && !texture)
1290 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1294 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1295 command->unitnum = unitnum;
1296 command->texture = texture;
1298 dpsoftrast.texbound[unitnum] = texture;
1299 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1302 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1304 dpsoftrast.pointer_vertex3f = vertex3f;
1305 dpsoftrast.stride_vertex = stride;
1307 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1309 dpsoftrast.pointer_color4f = color4f;
1310 dpsoftrast.pointer_color4ub = NULL;
1311 dpsoftrast.stride_color = stride;
1313 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1315 dpsoftrast.pointer_color4f = NULL;
1316 dpsoftrast.pointer_color4ub = color4ub;
1317 dpsoftrast.stride_color = stride;
1319 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1321 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1322 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1323 dpsoftrast.stride_texcoord[unitnum] = stride;
1326 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1327 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1329 thread->shader_mode = command->mode;
1330 thread->shader_permutation = command->permutation;
1331 thread->shader_exactspecularmath = command->exactspecularmath;
1333 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1335 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1336 command->mode = mode;
1337 command->permutation = permutation;
1338 command->exactspecularmath = exactspecularmath;
1340 dpsoftrast.shader_mode = mode;
1341 dpsoftrast.shader_permutation = permutation;
1342 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1345 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1346 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1348 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1350 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1352 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1353 command->index = index;
1354 command->val[0] = v0;
1355 command->val[1] = v1;
1356 command->val[2] = v2;
1357 command->val[3] = v3;
1359 dpsoftrast.uniform4f[index*4+0] = v0;
1360 dpsoftrast.uniform4f[index*4+1] = v1;
1361 dpsoftrast.uniform4f[index*4+2] = v2;
1362 dpsoftrast.uniform4f[index*4+3] = v3;
1364 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1366 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1367 command->index = index;
1368 memcpy(command->val, v, sizeof(command->val));
1370 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20, UniformMatrix4f: payload is the first uniform slot and a
// 16-float matrix; the matrix is declared through ALIGN() so the aligned
// stores below are valid.
1373 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a UniformMatrix4f command: copies all 16 floats into the thread's
// uniform4f array starting at element index*4.
1374 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1376 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Queues one UniformMatrix4f command per matrix in v (arraysize matrices of
// 16 floats each) and mirrors each matrix into dpsoftrast.uniform4f.
// Each matrix takes four consecutive uniform slots, hence index += 4.
// NOTE(review): local declarations and the branch keywords around the
// load/transpose sections are not visible in this listing; the transpose
// section is presumably conditional on the `transpose` argument - confirm.
1378 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1382 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1384 __m128 m0, m1, m2, m3;
1385 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1386 command->index = (DPSOFTRAST_UNIFORM)index;
// unaligned source pointer: use unaligned loads
1387 if (((size_t)v)&(ALIGN_SIZE-1))
1389 m0 = _mm_loadu_ps(v);
1390 m1 = _mm_loadu_ps(v+4);
1391 m2 = _mm_loadu_ps(v+8);
1392 m3 = _mm_loadu_ps(v+12);
// aligned source pointer: use aligned loads
1396 m0 = _mm_load_ps(v);
1397 m1 = _mm_load_ps(v+4);
1398 m2 = _mm_load_ps(v+8);
1399 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1403 __m128 t0, t1, t2, t3;
1404 t0 = _mm_unpacklo_ps(m0, m1);
1405 t1 = _mm_unpacklo_ps(m2, m3);
1406 t2 = _mm_unpackhi_ps(m0, m1);
1407 t3 = _mm_unpackhi_ps(m2, m3);
1408 m0 = _mm_movelh_ps(t0, t1);
1409 m1 = _mm_movehl_ps(t1, t0);
1410 m2 = _mm_movelh_ps(t2, t3);
1411 m3 = _mm_movehl_ps(t3, t2);
// store the four vectors into the queued command
1413 _mm_store_ps(command->val, m0);
1414 _mm_store_ps(command->val+4, m1);
1415 _mm_store_ps(command->val+8, m2);
1416 _mm_store_ps(command->val+12, m3);
// and mirror them into the global uniform array (aligned stores)
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1420 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1425 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1426 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1428 thread->uniform1i[command->index] = command->val;
// Queues a Uniform1i command for slot `index` and mirrors i0 into
// dpsoftrast.uniform1i at the same slot.
// NOTE(review): the statement at listing line 1434 is not visible here -
// presumably it stores i0 into the command; confirm against the file.
1430 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1432 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1433 command->index = index;
1436 dpsoftrast.uniform1i[command->index] = i0;
1439 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1440 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1442 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1444 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1446 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1447 x /= dpsoftrast.fb_viewportscale[1];
1448 y /= dpsoftrast.fb_viewportscale[2];
1449 z /= dpsoftrast.fb_viewportscale[3];
1450 w /= dpsoftrast.fb_viewportscale[0];
1451 w -= dpsoftrast.fb_viewportcenter[1]*x + dpsoftrast.fb_viewportcenter[2]*y + dpsoftrast.fb_viewportcenter[3]*z + dpsoftrast.fb_viewportcenter[0]*w;
1452 command->clipplane[0] = x;
1453 command->clipplane[1] = y;
1454 command->clipplane[2] = z;
1455 command->clipplane[3] = w;
// Copies `size` 4-float items from a strided byte source into a packed float
// destination. dst is written with aligned stores, so it must be 16-byte
// aligned (as _mm_store_ps requires).
// NOTE(review): the loop statements around the two copies are not visible in this listing.
1459 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1461 float *end = dst + size*4;
// unaligned loads are needed if either the start address or the stride breaks alignment
1462 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1466 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1475 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float items from a strided byte source into packed
// 4-float items (x, y, z, 1.0). dst is written with aligned stores.
// Tightly packed input (stride == sizeof(float[3])) is handled four items at
// a time: three 16-byte loads hold exactly four xyz triples. The recurring
// idiom "rotate so lane 0 is free, _mm_move_ss a 1.0 into lane 0, rotate
// back" yields (x, y, z, 1).
// NOTE(review): loop statements, pointer increments for dst and the else
// keywords are not visible in this listing.
1482 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1484 float *end = dst + size*4;
1485 if (stride == sizeof(float[3]))
// end of the part that can be processed in groups of four items
1487 float *end4 = dst + (size&~3)*4;
1488 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned variant: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1492 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// item 0: taken from v1
1493 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1494 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 1: x from v1, y and z from v2
1496 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1497 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1498 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 2: x and y from v2, z from v3
1499 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1500 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1501 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// item 3: upper three lanes of v3
1503 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1504 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1506 src += 4*sizeof(float[3]);
// aligned variant: identical shuffles, aligned loads
1513 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1514 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1515 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1516 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1518 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1521 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1522 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1525 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527 src += 4*sizeof(float[3]);
// one item at a time: each 16-byte load also reads the float following the
// triple, which is then replaced by 1.0
1531 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1535 __m128 v = _mm_loadu_ps((const float *)src);
1536 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1537 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1538 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1539 _mm_store_ps(dst, v);
1548 __m128 v = _mm_load_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
// Expands `size` 2-float items from a strided byte source into packed
// 4-float items (s, t, 0.0, 1.0). dst is written with aligned stores.
// Tightly packed input is handled two items per 16-byte load.
// NOTE(review): loop statements and dst increments are not visible in this listing.
1559 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 float *end = dst + size*4;
// constant supplying the (0, 1) upper half of every output item
1562 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1563 if (stride == sizeof(float[2]))
// end of the part that can be processed in pairs
1565 float *end2 = dst + (size&~1)*4;
1566 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1570 __m128 v = _mm_loadu_ps((const float *)src);
// first item: low half of v, high half of v2
1571 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// second item: high half of v moved down, high half of v2 kept
1572 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1574 src += 2*sizeof(float[2]);
1581 __m128 v = _mm_load_ps((const float *)src);
1582 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1583 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585 src += 2*sizeof(float[2]);
// single item: load two floats into the low half, keep (0, 1) in the high half
1591 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte items from a strided byte source into packed 4-float
// items, scaling each byte by 1/255. dst is written with aligned stores.
// Tightly packed input is handled four items (16 bytes) per load.
// NOTE(review): loop statements and dst increments are not visible in this listing.
1597 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1599 float *end = dst + size*4;
1600 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1601 if (stride == sizeof(unsigned char[4]))
// end of the part that can be processed in groups of four items
1603 float *end4 = dst + (size&~3)*4;
1604 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend bytes to 16 bit (v1 = items 0-1, v2 = items 2-3), then to 32 bit, convert and scale
1608 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1609 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1610 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1611 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1612 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1614 src += 4*sizeof(unsigned char[4]);
// aligned variant of the same conversion
1621 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
// single item: read the four bytes as one int (src is dereferenced as const int *)
1633 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1634 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Stores the four floats at src into dst with aligned stores; `end` marks
// dst + 4*size, i.e. `size` items. src is read once with an unaligned load.
// NOTE(review): the loop statement and dst increment are not visible in this listing.
1640 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1642 float *end = dst + 4*size;
1643 __m128 v = _mm_loadu_ps(src);
1646 _mm_store_ps(dst, v);
// Multiplies numitems 4-float vectors at in4f by the 4x4 matrix inmatrix16f.
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3, where m0..m3 are the four
// consecutive groups of four floats in inmatrix16f.
// The matrix is read with unaligned loads; the input uses aligned or
// unaligned loads depending on its address.
// NOTE(review): the store of each result, the loops and pointer increments
// are not visible in this listing.
1652 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1655 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1656 __m128 m0, m1, m2, m3;
// a bitwise comparison against the identity matrix selects the copy-only path
1658 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1660 // fast case for identity matrix
// in-place calls need no copy at all
1661 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1664 end = out4f + numitems*4;
1665 m0 = _mm_loadu_ps(inmatrix16f);
1666 m1 = _mm_loadu_ps(inmatrix16f + 4);
1667 m2 = _mm_loadu_ps(inmatrix16f + 8);
1668 m3 = _mm_loadu_ps(inmatrix16f + 12);
1669 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1673 __m128 v = _mm_loadu_ps(in4f);
1675 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1676 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1677 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1678 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input
1687 __m128 v = _mm_load_ps(in4f);
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1691 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1692 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f.
//
// out4f    destination, room for numitems*4 floats
// in4f     source, numitems*4 floats
// numitems number of 4-float items; values <= 0 copy nothing
//
// The buffers may be identical or overlap: an in-place call is a no-op and
// overlapping ranges are handled by memmove (memcpy is undefined for both).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would otherwise convert to a huge size_t
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// Projects one clip-space vertex `in` (x, y, z, w) to screen space and
// assigns the result to `out`. The vector is rotated to (1, x, y, z) so that
// it lines up with the viewport vectors (whose lane 0 belongs to the w term),
// scaled, divided by w, offset by the viewport center, and rotated back.
// The last lane of the result is center[0] + scale[0]/w.
1706 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1708 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1709 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1710 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1711 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Screen-space projection of one vertex; the lines visible here are
// identical to those of DPSOFTRAST_PROJECTVERTEX (rotate to (1, x, y, z),
// scale, divide by w, add the viewport center, rotate back).
1714 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1716 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1717 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1718 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1719 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Matrix * vector: out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component
// of p being broadcast across a vector before the multiply.
// NOTE(review): the declaration of p is not visible in this listing -
// presumably it is initialised from `in`.
1722 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1725 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1726 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1727 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1728 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y extent of a bounding box (minposf..maxposf,
// both 16-byte aligned) after transformation by inmatrix16f, and writes it to
// *starty / *endy.
// The eight box corners are transformed; corners with z + w >= 0 contribute
// their projected y directly (BBFRONT). Corners that fail that test
// contribute the points where their three box edges (to corners k^1, k^2,
// k^4) cross the z + w = 0 plane, for each edge whose other end passed (BBCLIP).
// clipmask starts as 0xFF and has bit k cleared for every corner that passed.
// NOTE(review): the return statement is not visible in this listing -
// presumably clipmask is returned; confirm. The BBFRONT(0, ...) and
// BBFRONT(7, ...) invocations and the BBCLIP #define line are also not visible.
1731 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1733 int clipmask = 0xFF;
1734 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start outside the [-2, 2] clamp range applied at the end
1735 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1736 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1737 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap the first two lanes of every matrix vector so the transformed y lands
// in lane 0, where the scalar (_ss) operations below work
1738 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1739 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1740 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1741 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is
// not negative clear bit k and fold y/w into minproj/maxproj
1742 #define BBFRONT(k, pos) \
1744 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1745 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1746 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1749 clipmask &= ~(1<<k); \
1750 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1751 minproj = _mm_min_ss(minproj, proj); \
1752 maxproj = _mm_max_ss(maxproj, proj); \
// corners are built by mixing lanes of minpos and maxpos
1756 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1757 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1758 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1759 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1760 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1761 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1765 if (clipmask&(1<<k)) \
1767 if (!(clipmask&(1<<(k^1)))) \
1769 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1770 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1771 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1772 minproj = _mm_min_ss(minproj, proj); \
1773 maxproj = _mm_max_ss(maxproj, proj); \
1775 if (!(clipmask&(1<<(k^2)))) \
1777 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1778 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1779 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1780 minproj = _mm_min_ss(minproj, proj); \
1781 maxproj = _mm_max_ss(maxproj, proj); \
1783 if (!(clipmask&(1<<(k^4)))) \
1785 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1786 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1787 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788 minproj = _mm_min_ss(minproj, proj); \
1789 maxproj = _mm_max_ss(maxproj, proj); \
// BBCLIP(k): for a corner that failed the z + w test, interpolate along each
// edge to a passing neighbour (frac = d_k / (d_k - d_neighbour)), project the
// crossing point and fold its y/w into minproj/maxproj
1793 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// bring the viewport element at index 2 (the y term) into lane 0
1794 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1795 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixels
1796 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1797 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1798 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1799 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (truncated, endy + 1)
1800 *starty = _mm_cvttss_si32(maxproj);
1801 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems vertices that need no matrix transform: each input
// vector is copied to out4f unchanged, its screen-space projection is written
// to screen4f, and a running component-wise min/max of the inputs is kept.
// The min/max box is then handed to DPSOFTRAST_Vertex_BoundY with an identity
// matrix to obtain *starty / *endy; that function's result is returned.
// out4f and screen4f are written with aligned stores.
// NOTE(review): loop statements and pointer increments are not visible in this listing.
1805 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1807 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1808 float *end = out4f + numitems*4;
1809 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1810 __m128 minpos, maxpos;
1811 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1813 minpos = maxpos = _mm_loadu_ps(in4f);
1816 __m128 v = _mm_loadu_ps(in4f);
1817 minpos = _mm_min_ps(minpos, v);
1818 maxpos = _mm_max_ps(maxpos, v);
1819 _mm_store_ps(out4f, v);
1820 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1821 _mm_store_ps(screen4f, v);
// aligned input: same steps with aligned loads
1829 minpos = maxpos = _mm_load_ps(in4f);
1832 __m128 v = _mm_load_ps(in4f);
1833 minpos = _mm_min_ps(minpos, v);
1834 maxpos = _mm_max_ps(maxpos, v);
1835 _mm_store_ps(out4f, v);
1836 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays for DPSOFTRAST_Vertex_BoundY
1845 ALIGN(float minposf[4]);
1846 ALIGN(float maxposf[4]);
1847 _mm_store_ps(minposf, minpos);
1848 _mm_store_ps(maxposf, maxpos);
1849 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems vertices by inmatrix16f and projects them: the
// transformed vector goes to out4f and its screen-space projection to
// screen4f. A running component-wise min/max of the untransformed inputs is
// kept and passed, together with the matrix, to DPSOFTRAST_Vertex_BoundY to
// obtain *starty / *endy; that function's result is returned.
// A matrix that is bitwise equal to identity is delegated to
// DPSOFTRAST_Vertex_Project. out4f and screen4f are written with aligned stores.
// NOTE(review): loop statements and pointer increments are not visible in this listing.
1854 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1856 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1857 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1859 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1860 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1861 end = out4f + numitems*4;
1862 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1863 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1864 m0 = _mm_loadu_ps(inmatrix16f);
1865 m1 = _mm_loadu_ps(inmatrix16f + 4);
1866 m2 = _mm_loadu_ps(inmatrix16f + 8);
1867 m3 = _mm_loadu_ps(inmatrix16f + 12);
1868 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1870 minpos = maxpos = _mm_loadu_ps(in4f);
1873 __m128 v = _mm_loadu_ps(in4f);
// bounds are taken before the transform, i.e. in the input space
1874 minpos = _mm_min_ps(minpos, v);
1875 maxpos = _mm_max_ps(maxpos, v);
1876 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1877 _mm_store_ps(out4f, v);
1878 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1879 _mm_store_ps(screen4f, v);
// aligned input: same steps with aligned loads
1887 minpos = maxpos = _mm_load_ps(in4f);
1890 __m128 v = _mm_load_ps(in4f);
1891 minpos = _mm_min_ps(minpos, v);
1892 maxpos = _mm_max_ps(maxpos, v);
1893 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894 _mm_store_ps(out4f, v);
1895 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896 _mm_store_ps(screen4f, v);
// spill the bounds to aligned arrays for DPSOFTRAST_Vertex_BoundY
1904 ALIGN(float minposf[4]);
1905 ALIGN(float maxposf[4]);
1906 _mm_store_ps(minposf, minpos);
1907 _mm_store_ps(maxposf, maxpos);
1908 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Expands one client vertex array (selected by inarray) into the 4-float
// working array dpsoftrast.post_array4f[outarray], for the vertex range
// dpsoftrast.firstvertex .. firstvertex + numvertices - 1.
// Positions are expanded from 3 floats. Colors come from the float pointer,
// else the unsigned-byte pointer, else the constant dpsoftrast.color.
// Texture coordinates are expanded according to their component count.
// NOTE(review): the switch statements, case labels for the texcoord
// component counts, breaks and the return are not visible in this listing.
1914 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1917 float *outf = dpsoftrast.post_array4f[outarray];
1918 const unsigned char *inb;
1919 int firstvertex = dpsoftrast.firstvertex;
1920 int numvertices = dpsoftrast.numvertices;
1924 case DPSOFTRAST_ARRAY_POSITION:
1925 stride = dpsoftrast.stride_vertex;
// byte address of the first vertex in the draw range
1926 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1927 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1929 case DPSOFTRAST_ARRAY_COLOR:
1930 stride = dpsoftrast.stride_color;
1931 if (dpsoftrast.pointer_color4f)
1933 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1934 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1936 else if (dpsoftrast.pointer_color4ub)
// stride was already set to the same value above
1938 stride = dpsoftrast.stride_color;
1939 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1940 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array set: replicate the constant color for every vertex
1944 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord arrays are indexed relative to DPSOFTRAST_ARRAY_TEXCOORD0
1948 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1949 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1951 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1952 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1955 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1958 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1961 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f. When inarray >= 0 the
// data is first loaded from that client array into post_array4f[outarray];
// a negative inarray reuses what post_array4f[outarray] already holds.
// NOTE(review): the return statement is not visible in this listing -
// presumably `data` is returned.
1973 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1975 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1976 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space without a matrix transform:
// screen coordinates go to dpsoftrast.screencoord4f, the Y extent to
// dpsoftrast.drawstarty / drawendy, and the result of
// DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// A negative inarray reuses what post_array4f[outarray] already holds.
// NOTE(review): the return statement is not visible in this listing -
// presumably `data` is returned.
1981 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1984 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1985 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it:
// screen coordinates go to dpsoftrast.screencoord4f, the Y extent to
// dpsoftrast.drawstarty / drawendy, and the result of
// DPSOFTRAST_Vertex_TransformProject to dpsoftrast.drawclipped.
// A negative inarray reuses what post_array4f[outarray] already holds.
// NOTE(review): the return statement is not visible in this listing -
// presumably `data` is returned.
1993 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1996 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1997 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for one span, covering pixels
// startx .. endx-1. w is evaluated from the triangle's plane equation
// (w[2] + x*w[0] + y*w[1]); the exact reciprocal is computed only at
// sub-span boundaries and interpolated linearly in between.
// NOTE(review): the declarations of x, z, dz and the statements that write
// into zf are not visible in this listing - presumably zf[x] receives the
// interpolated value; confirm against the file.
2004 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2007 int startx = span->startx;
2008 int endx = span->endx;
2009 float wslope = triangle->w[0];
// w at the span origin; the per-pixel term wslope*x is added where needed
2010 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel of the span
2011 float endz = 1.0f / (w + wslope * startx);
// no change of w along x: one value serves the whole span
2012 if (triangle->w[0] == 0)
2014 // LordHavoc: fast flat polygons (HUD/menu)
2015 for (x = startx;x < endx;x++)
// general case: walk the span in sub-spans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2019 for (x = startx;x < endx;)
2021 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span ends on the final pixel of the span
2023 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact reciprocal at the sub-span end, linear step in between
2024 endz = 1.0f / (w + wslope * nextsub);
2025 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2026 for (; x <= endsub; x++, z += dz)
2031 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2034 int startx = span->startx;
2035 int endx = span->endx;
2038 unsigned char * RESTRICT pixelmask = span->pixelmask;
2039 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2042 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2043 // handle alphatest now (this affects depth writes too)
2044 if (thread->alphatest)
2045 for (x = startx;x < endx;x++)
2046 if (in4f[x*4+3] < 0.5f)
2047 pixelmask[x] = false;
2048 // FIXME: this does not handle bigendian
2049 switch(thread->fb_blendmode)
2051 case DPSOFTRAST_BLENDMODE_OPAQUE:
2052 for (x = startx;x < endx;x++)
2056 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2057 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2058 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2059 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2060 pixel[x*4+0] = d[0];
2061 pixel[x*4+1] = d[1];
2062 pixel[x*4+2] = d[2];
2063 pixel[x*4+3] = d[3];
2066 case DPSOFTRAST_BLENDMODE_ALPHA:
2067 for (x = startx;x < endx;x++)
2071 a = in4f[x*4+3] * 255.0f;
2072 b = 1.0f - in4f[x*4+3];
2073 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2074 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2075 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2076 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2077 pixel[x*4+0] = d[0];
2078 pixel[x*4+1] = d[1];
2079 pixel[x*4+2] = d[2];
2080 pixel[x*4+3] = d[3];
2083 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084 for (x = startx;x < endx;x++)
2088 a = in4f[x*4+3] * 255.0f;
2089 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2090 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2091 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2092 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2093 pixel[x*4+0] = d[0];
2094 pixel[x*4+1] = d[1];
2095 pixel[x*4+2] = d[2];
2096 pixel[x*4+3] = d[3];
2099 case DPSOFTRAST_BLENDMODE_ADD:
2100 for (x = startx;x < endx;x++)
2104 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2105 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2106 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2107 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2108 pixel[x*4+0] = d[0];
2109 pixel[x*4+1] = d[1];
2110 pixel[x*4+2] = d[2];
2111 pixel[x*4+3] = d[3];
2114 case DPSOFTRAST_BLENDMODE_INVMOD:
2115 for (x = startx;x < endx;x++)
2119 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2120 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2121 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2122 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2123 pixel[x*4+0] = d[0];
2124 pixel[x*4+1] = d[1];
2125 pixel[x*4+2] = d[2];
2126 pixel[x*4+3] = d[3];
2129 case DPSOFTRAST_BLENDMODE_MUL:
2130 for (x = startx;x < endx;x++)
2134 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2135 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2136 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2137 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2138 pixel[x*4+0] = d[0];
2139 pixel[x*4+1] = d[1];
2140 pixel[x*4+2] = d[2];
2141 pixel[x*4+3] = d[3];
2144 case DPSOFTRAST_BLENDMODE_MUL2:
2145 for (x = startx;x < endx;x++)
2149 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2150 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2151 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2152 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2153 pixel[x*4+0] = d[0];
2154 pixel[x*4+1] = d[1];
2155 pixel[x*4+2] = d[2];
2156 pixel[x*4+3] = d[3];
2159 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2160 for (x = startx;x < endx;x++)
2164 a = in4f[x*4+3] * -255.0f;
2165 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2166 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2167 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2168 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2169 pixel[x*4+0] = d[0];
2170 pixel[x*4+1] = d[1];
2171 pixel[x*4+2] = d[2];
2172 pixel[x*4+3] = d[3];
2175 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2176 for (x = startx;x < endx;x++)
2181 b = 1.0f - in4f[x*4+3];
2182 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2183 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2184 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2185 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2186 pixel[x*4+0] = d[0];
2187 pixel[x*4+1] = d[1];
2188 pixel[x*4+2] = d[2];
2189 pixel[x*4+3] = d[3];
2192 case DPSOFTRAST_BLENDMODE_INVADD:
2193 for (x = startx;x < endx;x++)
2197 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2198 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2199 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2200 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2201 pixel[x*4+0] = d[0];
2202 pixel[x*4+1] = d[1];
2203 pixel[x*4+2] = d[2];
2204 pixel[x*4+3] = d[3];
2210 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2214 int startx = span->startx;
2215 int endx = span->endx;
2217 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2218 unsigned char * RESTRICT pixelmask = span->pixelmask;
2219 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2220 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2223 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2224 pixeli += span->y * dpsoftrast.fb_width + span->x;
2225 // handle alphatest now (this affects depth writes too)
2226 if (thread->alphatest)
2227 for (x = startx;x < endx;x++)
2228 if (in4ub[x*4+3] < 128)
2229 pixelmask[x] = false;
2230 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2231 // helps sprites, text and hud artwork
2232 switch(thread->fb_blendmode)
2234 case DPSOFTRAST_BLENDMODE_ALPHA:
2235 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2236 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2237 for (x = startx;x < endx;x++)
2238 if (in4ub[x*4+3] < 1)
2239 pixelmask[x] = false;
2241 case DPSOFTRAST_BLENDMODE_OPAQUE:
2242 case DPSOFTRAST_BLENDMODE_ADD:
2243 case DPSOFTRAST_BLENDMODE_INVMOD:
2244 case DPSOFTRAST_BLENDMODE_MUL:
2245 case DPSOFTRAST_BLENDMODE_MUL2:
2246 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2247 case DPSOFTRAST_BLENDMODE_INVADD:
2250 // put some special values at the end of the mask to ensure the loops end
2251 pixelmask[endx] = 1;
2252 pixelmask[endx+1] = 0;
2253 // LordHavoc: use a double loop to identify subspans, this helps the
2254 // optimized copy/blend loops to perform at their best, most triangles
2255 // have only one run of pixels, and do the search using wide reads...
2259 // if this pixel is masked off, it's probably not alone...
2266 // the 4-item search must be aligned or else it stalls badly
2267 if ((x & 3) && !pixelmask[x]) x++;
2268 if ((x & 3) && !pixelmask[x]) x++;
2269 if ((x & 3) && !pixelmask[x]) x++;
2270 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2274 for (;!pixelmask[x];x++)
2276 // rather than continue the loop, just check the end variable
2280 // find length of subspan
2285 if ((subx & 3) && pixelmask[subx]) subx++;
2286 if ((subx & 3) && pixelmask[subx]) subx++;
2287 if ((subx & 3) && pixelmask[subx]) subx++;
2288 while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2292 for (;pixelmask[subx];subx++)
2294 // the checks can overshoot, so make sure to clip it...
2297 // now that we know the subspan length... process!
2298 switch(thread->fb_blendmode)
2300 case DPSOFTRAST_BLENDMODE_OPAQUE:
2304 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2309 while (x + 16 <= subx)
2311 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2312 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2313 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2314 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2319 while (x + 4 <= subx)
2321 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2327 pixeli[x+1] = ini[x+1];
2337 case DPSOFTRAST_BLENDMODE_ALPHA:
2338 #define FINISHBLEND(blend2, blend1) \
2339 for (;x + 1 < subx;x += 2) \
2342 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2343 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2345 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2350 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2351 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2353 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2357 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2358 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2360 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2361 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2364 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2366 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2367 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2369 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2370 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2373 case DPSOFTRAST_BLENDMODE_ADD:
2374 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2376 case DPSOFTRAST_BLENDMODE_INVMOD:
2378 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2380 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2383 case DPSOFTRAST_BLENDMODE_MUL:
2384 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2386 case DPSOFTRAST_BLENDMODE_MUL2:
2387 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2389 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2391 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2392 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2394 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2395 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2398 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2400 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2401 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2403 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2404 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2407 case DPSOFTRAST_BLENDMODE_INVADD:
2409 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2411 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2419 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2422 int startx = span->startx;
2423 int endx = span->endx;
2428 float tc[2], endtc[2];
2430 unsigned int tci[2];
2431 unsigned int tci1[2];
2432 unsigned int tcimin[2];
2433 unsigned int tcimax[2];
2438 const unsigned char * RESTRICT pixelbase;
2439 const unsigned char * RESTRICT pixel[4];
2440 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2441 // if no texture is bound, just fill it with white
2444 for (x = startx;x < endx;x++)
2446 out4f[x*4+0] = 1.0f;
2447 out4f[x*4+1] = 1.0f;
2448 out4f[x*4+2] = 1.0f;
2449 out4f[x*4+3] = 1.0f;
2453 mip = triangle->mip[texunitindex];
2454 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2455 // if this mipmap of the texture is 1 pixel, just fill it with that color
2456 if (texture->mipmap[mip][1] == 4)
2458 c[0] = texture->bytes[2] * (1.0f/255.0f);
2459 c[1] = texture->bytes[1] * (1.0f/255.0f);
2460 c[2] = texture->bytes[0] * (1.0f/255.0f);
2461 c[3] = texture->bytes[3] * (1.0f/255.0f);
2462 for (x = startx;x < endx;x++)
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
2471 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2473 flags = texture->flags;
2474 tcscale[0] = texture->mipmap[mip][2];
2475 tcscale[1] = texture->mipmap[mip][3];
2476 tciwidth = texture->mipmap[mip][2];
2479 tcimax[0] = texture->mipmap[mip][2]-1;
2480 tcimax[1] = texture->mipmap[mip][3]-1;
2481 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2482 tciwrapmask[1] = texture->mipmap[mip][3]-1;
2483 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2484 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2490 for (x = startx;x < endx;)
2492 unsigned int subtc[2];
2493 unsigned int substep[2];
2494 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2495 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2496 if (nextsub >= endx)
2498 nextsub = endsub = endx-1;
2499 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2503 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2504 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2510 substep[0] = (endtc[0] - tc[0]) * subscale;
2511 substep[1] = (endtc[1] - tc[1]) * subscale;
2512 subtc[0] = tc[0] * (1<<12);
2513 subtc[1] = tc[1] * (1<<12);
2516 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2521 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2522 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2523 tci[0] = subtc[0]>>12;
2524 tci[1] = subtc[1]>>12;
2525 tci1[0] = tci[0] + 1;
2526 tci1[1] = tci[1] + 1;
2527 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2528 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2529 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2530 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2531 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2533 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2534 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2535 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2536 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2537 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2538 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2539 out4f[x*4+0] = c[0];
2540 out4f[x*4+1] = c[1];
2541 out4f[x*4+2] = c[2];
2542 out4f[x*4+3] = c[3];
2547 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2549 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2550 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2551 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2552 tci[0] = subtc[0]>>12;
2553 tci[1] = subtc[1]>>12;
2554 tci1[0] = tci[0] + 1;
2555 tci1[1] = tci[1] + 1;
2556 tci[0] &= tciwrapmask[0];
2557 tci[1] &= tciwrapmask[1];
2558 tci1[0] &= tciwrapmask[0];
2559 tci1[1] &= tciwrapmask[1];
2560 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2561 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2562 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2563 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2564 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2565 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2566 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2567 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2568 out4f[x*4+0] = c[0];
2569 out4f[x*4+1] = c[1];
2570 out4f[x*4+2] = c[2];
2571 out4f[x*4+3] = c[3];
2575 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2577 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2579 tci[0] = subtc[0]>>12;
2580 tci[1] = subtc[1]>>12;
2581 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2582 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2583 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2584 c[0] = pixel[0][2] * (1.0f / 255.0f);
2585 c[1] = pixel[0][1] * (1.0f / 255.0f);
2586 c[2] = pixel[0][0] * (1.0f / 255.0f);
2587 c[3] = pixel[0][3] * (1.0f / 255.0f);
2588 out4f[x*4+0] = c[0];
2589 out4f[x*4+1] = c[1];
2590 out4f[x*4+2] = c[2];
2591 out4f[x*4+3] = c[3];
2596 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2598 tci[0] = subtc[0]>>12;
2599 tci[1] = subtc[1]>>12;
2600 tci[0] &= tciwrapmask[0];
2601 tci[1] &= tciwrapmask[1];
2602 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2603 c[0] = pixel[0][2] * (1.0f / 255.0f);
2604 c[1] = pixel[0][1] * (1.0f / 255.0f);
2605 c[2] = pixel[0][0] * (1.0f / 255.0f);
2606 c[3] = pixel[0][3] * (1.0f / 255.0f);
2607 out4f[x*4+0] = c[0];
2608 out4f[x*4+1] = c[1];
2609 out4f[x*4+2] = c[2];
2610 out4f[x*4+3] = c[3];
2616 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2620 int startx = span->startx;
2621 int endx = span->endx;
2623 __m128 data, slope, tcscale;
2624 __m128i tcsize, tcmask, tcoffset, tcmax;
2626 __m128i subtc, substep, endsubtc;
2629 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2630 const unsigned char * RESTRICT pixelbase;
2631 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2632 // if no texture is bound, just fill it with white
2635 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2638 mip = triangle->mip[texunitindex];
2639 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2640 // if this mipmap of the texture is 1 pixel, just fill it with that color
2641 if (texture->mipmap[mip][1] == 4)
2643 unsigned int k = *((const unsigned int *)pixelbase);
2644 for (x = startx;x < endx;x++)
2648 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2649 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2650 flags = texture->flags;
2651 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2652 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2653 tcscale = _mm_cvtepi32_ps(tcsize);
2654 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2655 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2656 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2658 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2659 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2660 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2661 tcmax = _mm_packs_epi32(tcmask, tcmask);
2662 for (x = startx;x < endx;)
2664 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2665 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2666 if (nextsub >= endx)
2668 nextsub = endsub = endx-1;
2669 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2673 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2675 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2676 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2677 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2678 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2679 substep = _mm_slli_epi32(substep, 1);
2682 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2683 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2685 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2686 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2688 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2689 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2690 tci = _mm_madd_epi16(tci, tcoffset);
2691 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2692 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2693 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2694 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2695 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2696 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2697 fracm = _mm_srli_epi16(subtc, 1);
2698 pix1 = _mm_add_epi16(pix1,
2699 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2701 pix3 = _mm_add_epi16(pix3,
2702 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2703 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2704 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2705 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2706 pix2 = _mm_add_epi16(pix2,
2707 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2708 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2709 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2713 const unsigned char * RESTRICT ptr1;
2714 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2715 tci = _mm_madd_epi16(tci, tcoffset);
2716 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2717 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2718 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2719 fracm = _mm_srli_epi16(subtc, 1);
2720 pix1 = _mm_add_epi16(pix1,
2721 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2722 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2723 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2724 pix1 = _mm_add_epi16(pix1,
2725 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2727 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2731 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2733 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2735 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2736 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2737 tci = _mm_madd_epi16(tci, tcoffset);
2738 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740 _mm_setzero_si128());
2741 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743 _mm_setzero_si128());
2744 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2745 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2746 tci = _mm_madd_epi16(tci, tcoffset);
2747 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2748 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2749 _mm_setzero_si128());
2750 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2751 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2752 _mm_setzero_si128());
2753 fracm = _mm_srli_epi16(subtc, 1);
2754 pix1 = _mm_add_epi16(pix1,
2755 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2756 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2757 pix3 = _mm_add_epi16(pix3,
2758 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2759 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2760 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2761 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2762 pix2 = _mm_add_epi16(pix2,
2763 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2764 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2765 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2769 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2770 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2771 tci = _mm_madd_epi16(tci, tcoffset);
2772 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2773 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2774 _mm_setzero_si128());
2775 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2776 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2777 _mm_setzero_si128());
2778 fracm = _mm_srli_epi16(subtc, 1);
2779 pix1 = _mm_add_epi16(pix1,
2780 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2781 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2782 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2783 pix1 = _mm_add_epi16(pix1,
2784 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2786 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2792 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2794 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2795 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2796 tci = _mm_madd_epi16(tci, tcoffset);
2797 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2798 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2799 _mm_setzero_si128());
2800 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2801 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2802 _mm_setzero_si128());
2803 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2804 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2805 tci = _mm_madd_epi16(tci, tcoffset);
2806 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2807 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2808 _mm_setzero_si128());
2809 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2810 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2811 _mm_setzero_si128());
2812 fracm = _mm_srli_epi16(subtc, 1);
2813 pix1 = _mm_add_epi16(pix1,
2814 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2815 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2816 pix3 = _mm_add_epi16(pix3,
2817 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2818 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2819 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2820 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2821 pix2 = _mm_add_epi16(pix2,
2822 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2823 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2824 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2828 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2829 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2830 tci = _mm_madd_epi16(tci, tcoffset);
2831 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2832 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2833 _mm_setzero_si128());
2834 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2835 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2836 _mm_setzero_si128());
2837 fracm = _mm_srli_epi16(subtc, 1);
2838 pix1 = _mm_add_epi16(pix1,
2839 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2840 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2841 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2842 pix1 = _mm_add_epi16(pix1,
2843 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2844 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2845 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2852 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2854 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2856 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2857 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2858 tci = _mm_madd_epi16(tci, tcoffset);
2859 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2860 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2864 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2865 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2866 tci = _mm_madd_epi16(tci, tcoffset);
2867 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2873 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2875 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2876 tci = _mm_and_si128(tci, tcmax);
2877 tci = _mm_madd_epi16(tci, tcoffset);
2878 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2879 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2883 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2884 tci = _mm_and_si128(tci, tcmax);
2885 tci = _mm_madd_epi16(tci, tcoffset);
2886 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2895 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2898 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Declared to take a pointer to const float and to return a float.
// NOTE(review): only the signature line is present in this listing; the body is
// not visible here, so the returned value cannot be documented — confirm against the full file.
2901 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Float span stage: for each x in [span->startx, span->endx) builds a 4-component
// factor c = (data + slope*x) * z and stores in4f * c into out4f (4 floats per pixel).
// data and slope are filled in by DPSOFTRAST_CALCATTRIB4F for the given arrayindex.
// NOTE(review): the declarations of x, c, data, slope and z, and the statement that
// sets z, are not present in this listing; z presumably comes from zf[x] — confirm.
2907 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2910 int startx = span->startx;
2911 int endx = span->endx;
// fills data and slope, used below as base value and per-x increment
2916 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2917 for (x = startx;x < endx;x++)
// attribute value at x, scaled by z
2920 c[0] = (data[0] + slope[0]*x) * z;
2921 c[1] = (data[1] + slope[1]*x) * z;
2922 c[2] = (data[2] + slope[2]*x) * z;
2923 c[3] = (data[3] + slope[3]*x) * z;
// modulate the input component-wise
2924 out4f[x*4+0] = in4f[x*4+0] * c[0];
2925 out4f[x*4+1] = in4f[x*4+1] * c[1];
2926 out4f[x*4+2] = in4f[x*4+2] * c[2];
2927 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Float span stage: for each x in [span->startx, span->endx) evaluates
// (data + slope*x) * z per component and writes the four results to out4f.
// data and slope are filled in by DPSOFTRAST_CALCATTRIB4F for the given arrayindex.
// NOTE(review): the declarations of x, c, data, slope and z, and the statement that
// sets z, are not present in this listing; z presumably comes from zf[x] — confirm.
2931 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2934 int startx = span->startx;
2935 int endx = span->endx;
// fills data and slope, used below as base value and per-x increment
2940 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2941 for (x = startx;x < endx;x++)
// attribute value at x, scaled by z
2944 c[0] = (data[0] + slope[0]*x) * z;
2945 c[1] = (data[1] + slope[1]*x) * z;
2946 c[2] = (data[2] + slope[2]*x) * z;
2947 c[3] = (data[3] + slope[3]*x) * z;
// store the value unchanged
2948 out4f[x*4+0] = c[0];
2949 out4f[x*4+1] = c[1];
2950 out4f[x*4+2] = c[2];
2951 out4f[x*4+3] = c[3];
2955 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2957 int x, startx = span->startx, endx = span->endx;
2958 float c[4], localcolor[4];
2959 localcolor[0] = subcolor[0];
2960 localcolor[1] = subcolor[1];
2961 localcolor[2] = subcolor[2];
2962 localcolor[3] = subcolor[3];
2963 for (x = startx;x < endx;x++)
2965 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2966 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2967 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2968 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2969 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2970 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2971 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2972 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2976 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2978 int x, startx = span->startx, endx = span->endx;
2979 for (x = startx;x < endx;x++)
2981 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2982 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2983 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2984 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2988 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2990 int x, startx = span->startx, endx = span->endx;
2991 for (x = startx;x < endx;x++)
2993 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2994 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2995 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2996 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span blend: for each x in [span->startx, span->endx) every component of
// out4f is set to ina4f*a + inb4f*b, where a is 1 minus the fourth component of
// inb4f at that pixel.
// NOTE(review): the declaration of a and b and the assignment to b are not present
// in this listing; b is presumably inb4f[x*4+3] — confirm.
3000 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3002 int x, startx = span->startx, endx = span->endx;
3004 for (x = startx;x < endx;x++)
// weight applied to ina4f
3006 a = 1.0f - inb4f[x*4+3];
// weighted sum of the two inputs, component by component
3008 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3009 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3010 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3011 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
3015 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3017 int x, startx = span->startx, endx = span->endx;
3018 float localcolor[4], ilerp, lerp;
3019 localcolor[0] = color[0];
3020 localcolor[1] = color[1];
3021 localcolor[2] = color[2];
3022 localcolor[3] = color[3];
3023 ilerp = 1.0f - localcolor[3];
3024 lerp = localcolor[3];
3025 for (x = startx;x < endx;x++)
3027 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3028 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3029 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3030 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// 8-bit span stage: multiplies each byte of in4ub by an interpolated per-component
// factor and writes the result to out4ub (4 bytes per pixel, indexed by x).
// The factor is evaluated as (data + slope*x) * zf[x] at subspan end points, converted
// to integers with a scale of 256, and stepped with 16-bit adds in between.
// NOTE(review): several lines of this function are not present in this listing: the
// declarations of x, data, slope, mod and endmod; whatever assigns mod and submod
// (both are read below without a visible assignment); and the control around the
// single-pixel tail. Confirm against the full file.
3036 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3040 int startx = span->startx;
3041 int endx = span->endx;
3044 __m128i submod, substep, endsubmod;
3045 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 (presumably RGBA -> BGRA order — confirm)
3046 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3047 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// factor at the first pixel, and its 256-scaled integer form
3048 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3049 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// the span is walked in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
3050 for (x = startx; x < endx;)
3052 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3053 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last subspan: end on the final pixel and rescale the step to the remaining distance
3054 if (nextsub >= endx)
3056 nextsub = endsub = endx-1;
3057 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// factor at the subspan end point, the scaled increment towards it, and its integer form
3061 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3062 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3063 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack (submod, submod + substep) for two neighbouring pixels into 16-bit lanes; the step is duplicated for both
3064 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3065 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod advances by only one substep per iteration,
// which looks like half the intended rate — confirm whether the step should be doubled.
3066 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// each input byte is placed in the high half of a 16-bit lane (byte*256)
3068 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
// high 16 bits of the unsigned product, i.e. (byte*256 * factor) >> 16
3069 pix = _mm_mulhi_epu16(pix, submod);
// pack to bytes with unsigned saturation and store both pixels
3070 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for a single pixel
3074 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3075 pix = _mm_mulhi_epu16(pix, submod);
3076 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit span stage: writes an interpolated per-component value to out4ub
// (4 bytes per pixel, indexed by x).
// The value is evaluated as (data + slope*x) * zf[x] at subspan end points, converted
// to integers with a scale of 4095, stepped with 16-bit adds in between, and shifted
// right by 4 before being packed to bytes.
// NOTE(review): several lines of this function are not present in this listing: the
// declarations of x, data, slope, mod and endmod; whatever assigns mod and submod
// (both are read below without a visible assignment); and the control around the
// single-pixel tail. Confirm against the full file.
3083 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3087 int startx = span->startx;
3088 int endx = span->endx;
3091 __m128i submod, substep, endsubmod;
3092 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2 (presumably RGBA -> BGRA order — confirm)
3093 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3094 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// value at the first pixel, and its 4095-scaled integer form
3095 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3096 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// the span is walked in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
3097 for (x = startx; x < endx;)
3099 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3100 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// last subspan: end on the final pixel and rescale the step to the remaining distance
3101 if (nextsub >= endx)
3103 nextsub = endsub = endx-1;
3104 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// value at the subspan end point, the scaled increment towards it, and its integer form
3108 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3109 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3110 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack (submod, submod + substep) for two neighbouring pixels into 16-bit lanes; the step is duplicated for both
3111 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3112 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): x advances by 2 but submod advances by only one substep per iteration,
// which looks like half the intended rate — confirm whether the step should be doubled.
3113 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// arithmetic shift by 4 brings the 4095-scaled value down to byte range
3115 __m128i pix = _mm_srai_epi16(submod, 4);
// pack to bytes with unsigned saturation and store both pixels
3116 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for a single pixel
3120 __m128i pix = _mm_srai_epi16(submod, 4);
3121 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine: per byte, out4ub = ina4ub + (inb4ub - threshold clamped at zero),
// packed back to bytes with unsigned saturation. The threshold is subcolor scaled by 255.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last four statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3128 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3131 int x, startx = span->startx, endx = span->endx;
// subcolor * 255 converted to integers; _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2
3132 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// narrow to 16-bit lanes, duplicated so both pixels of a pair use the same threshold
3133 localcolor = _mm_packs_epi32(localcolor, localcolor);
3134 for (x = startx;x+2 <= endx;x+=2)
// widen the bytes of two pixels from each input to 16-bit lanes
3136 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3137 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// the unsigned saturating subtract clamps (inb - threshold) at zero before the add
3138 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3139 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3143 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3146 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span multiply: per byte, out4ub = (ina4ub * inb4ub) >> 8.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last four statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3151 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3154 int x, startx = span->startx, endx = span->endx;
3155 for (x = startx;x+2 <= endx;x+=2)
// ina bytes land in the low half of each 16-bit lane, inb bytes in the high half (byte*256)
3157 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3158 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of a * (b*256), which equals (a*b) >> 8
3159 pix1 = _mm_mulhi_epu16(pix1, pix2);
3160 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3164 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3165 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3166 pix1 = _mm_mulhi_epu16(pix1, pix2);
3167 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit span add: per byte, out4ub = ina4ub + inb4ub, packed back to bytes with
// unsigned saturation.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last four statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3172 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3175 int x, startx = span->startx, endx = span->endx;
3176 for (x = startx;x+2 <= endx;x+=2)
// widen the bytes of two pixels from each input to 16-bit lanes so the sum cannot wrap
3178 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3179 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3180 pix1 = _mm_add_epi16(pix1, pix2);
3181 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3185 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3186 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3187 pix1 = _mm_add_epi16(pix1, pix2);
3188 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit tinted add: per byte, out4ub = ina4ub + ((inb4ub * tint) >> 8), packed back
// to bytes with unsigned saturation. tint is inbtintbgra scaled by 256.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last four statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3193 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3196 int x, startx = span->startx, endx = span->endx;
// four tint components scaled by 256, converted to integers and duplicated into eight 16-bit lanes
3197 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3198 tint = _mm_packs_epi32(tint, tint);
3199 for (x = startx;x+2 <= endx;x+=2)
// ina bytes land in the low half of each 16-bit lane, inb bytes in the high half (byte*256)
3201 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3202 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
// high 16 bits of tint * (b*256), added on top of the first input
3203 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3204 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3208 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3209 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3210 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3211 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend: per byte, out4ub = ina4ub + (((inb4ub - ina4ub) * blend) >> 8), where
// blend is the fourth byte of the inb4ub pixel, packed back to bytes with unsigned saturation.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last five statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3216 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3219 int x, startx = span->startx, endx = span->endx;
3220 for (x = startx;x+2 <= endx;x+=2)
3222 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3223 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel of pix2 across that pixel's four lanes
3224 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4, so the signed high-half product equals (difference * blend) >> 8
3225 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3226 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the same computation
3230 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3231 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3232 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3233 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3234 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend towards a constant colour: per byte,
// out4ub = in4ub + (((localcolor - in4ub) * blend) >> 8), packed back to bytes with
// unsigned saturation. localcolor is color scaled by 255 with components 0 and 2
// swapped; blend is its lane 3.
// The loop handles two pixels (8 bytes) per iteration starting at span->startx; the
// last three statements perform the same computation on one pixel.
// NOTE(review): the condition guarding the single-pixel tail is not present in this
// listing — confirm against the full file.
3239 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3242 int x, startx = span->startx, endx = span->endx;
// color * 255 converted to integers; _MM_SHUFFLE(3, 0, 1, 2) swaps components 0 and 2
3243 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// narrow to 16-bit lanes, duplicated for the two pixels of a pair
3244 localcolor = _mm_packs_epi32(localcolor, localcolor);
// lane 3 broadcast to every lane and pre-shifted by 4 for the high-half multiply below
3245 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3246 for (x = startx;x+2 <= endx;x+=2)
3248 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 too, so the signed high-half product equals (difference * blend) >> 8
3249 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3250 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same computation
3254 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3255 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3256 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3263 void DPSOFTRAST_VertexShader_Generic(void)
3265 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3266 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3267 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3268 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3269 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader. Fills buffer_z through DPSOFTRAST_Draw_Span_Begin,
// builds the span colour in buffer_FragColorbgra8 according to thread->shader_permutation,
// and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): brace lines (and possibly an `else`) are not present in this listing, so
// the exact nesting of the branches below should be confirmed against the full file.
3272 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3274 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3278 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// DIFFUSE: sample texture unit GL20TU_FIRST with array 2 and modulate it by array 1
3279 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3281 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3282 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
// SPECULAR: sample texture unit GL20TU_SECOND and combine it with the colour so far
3283 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3285 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3286 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3289 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this tests SHADERPERMUTATION_COLORMAPPING again, the same flag as the
// preceding if; as an else-if of that test it can never be taken, so the AddBuffers
// path looks unreachable. A different flag was probably intended — confirm.
3291 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3294 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3296 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3299 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: write interpolated array 1 directly
// (presumably the else branch of the DIFFUSE test — confirm)
3304 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3305 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3310 void DPSOFTRAST_VertexShader_PostProcess(void)
3312 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3313 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3314 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3317 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3319 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3320 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3321 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3322 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3323 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3324 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3325 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3327 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3328 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3330 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3331 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3333 // TODO: implement saturation
3335 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3337 // TODO: implement gammaramps
3339 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3344 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3346 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3349 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3351 // this is never called (because colormask is off when this shader is used)
3352 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3353 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3354 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3355 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3356 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3361 void DPSOFTRAST_VertexShader_FlatColor(void)
3363 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader: samples texture unit GL20TU_COLOR (array 2)
// for the span and multiplies every texel by a constant built from the Color_Ambient
// and Alpha uniforms.
// Results go straight into the framebuffer row unless alpha test is on or the blend
// mode is not DPSOFTRAST_BLENDMODE_OPAQUE; in that case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 is called with it.
// NOTE(review): several lines are not present in this listing: the declarations of
// color, pix and pix2, any statement that advances x past the four pixels handled by
// the wide path, and any per-pixel mask test before the single-pixel path. Confirm
// against the full file.
3367 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3370 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: pixel (span->x, span->y) in colour buffer 0, 4 bytes per pixel
3371 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3372 int x, startx = span->startx, endx = span->endx;
3373 __m128i Color_Ambientm;
3374 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3375 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3378 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending: render into the local buffer and let the finish call write the framebuffer
3379 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3380 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 as integers, with components 0 and 2 swapped by _MM_SHUFFLE(3, 0, 1, 2)
3381 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// lane 3 is cleared and replaced by the Alpha uniform; note it is scaled by 255 while the colour lanes use 256
3382 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// narrow to 16-bit lanes, duplicated for two pixels
3384 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3385 for (x = startx;x < endx;x++)
// wide path: four pixels at once when at least four remain and their four mask bytes are all 1
3388 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3391 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texel bytes are placed in the high half of each 16-bit lane; the high-half product is (texel * constant) >> 8
3392 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3393 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3394 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3400 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3401 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3402 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case needs the finish call
3404 if (pixel == buffer_FragColorbgra8)
3405 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3411 void DPSOFTRAST_VertexShader_VertexColor(void)
3413 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3414 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3415 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader: samples texture unit GL20TU_COLOR (array 2)
// and multiplies every texel by (Color_Diffuse * interpolated colour attribute + Color_Ambient),
// all in 16-bit fixed point.
// Results go straight into the framebuffer row unless alpha test is on or the blend
// mode is not DPSOFTRAST_BLENDMODE_OPAQUE; in that case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 is called with it.
// NOTE(review): several lines are not present in this listing: the declarations of
// data, slope, mod2 and pix2, any statement that advances x past the four pixels
// handled by the wide path, and any per-pixel mask test before the single-pixel path.
// Confirm against the full file.
3418 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3421 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: pixel (span->x, span->y) in colour buffer 0, 4 bytes per pixel
3422 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3423 int x, startx = span->startx, endx = span->endx;
3424 __m128i Color_Ambientm, Color_Diffusem;
3426 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3427 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3428 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3429 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3430 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3431 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending: render into the local buffer and let the finish call write the framebuffer
3432 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3433 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 as integers, components 0 and 2 swapped; lane 3 replaced by the Alpha uniform * 255
3434 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3435 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3436 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3437 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse * 4096 as integers, components 0 and 2 swapped, lane 3 cleared
3438 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3439 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3440 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// colour attribute: component swap, advance to startx, then scale value and slope by 4096
3441 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3442 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3443 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3444 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3445 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3446 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration
3447 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3449 __m128i color, mod, pix;
// wide path: four pixels at once when at least four remain and their four mask bytes are all 1
3450 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3453 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3454 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// attribute * z for each of the four pixels; data is stepped three extra times inside this path
3455 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3456 data = _mm_add_ps(data, slope);
3457 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3458 data = _mm_add_ps(data, slope);
3459 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3460 data = _mm_add_ps(data, slope);
3461 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * attribute) high half, plus ambient, times the texel placed in the high byte of each lane
3462 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3463 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3464 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3465 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3466 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path with the same arithmetic
3472 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3473 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3474 mod = _mm_packs_epi32(mod, mod);
3475 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3476 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered case needs the finish call
3478 if (pixel == buffer_FragColorbgra8)
3479 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3485 void DPSOFTRAST_VertexShader_Lightmap(void)
3487 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3488 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3489 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3492 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3495 unsigned char * RESTRICT pixelmask = span->pixelmask;
3496 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3497 int x, startx = span->startx, endx = span->endx;
3498 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3499 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3500 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3501 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3502 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3503 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3504 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3505 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3506 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3507 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3508 pixel = buffer_FragColorbgra8;
3509 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3510 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3511 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3512 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3513 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3514 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3515 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3516 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3518 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3519 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3520 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3521 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3522 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3523 for (x = startx;x < endx;x++)
3525 __m128i color, lightmap, glow, pix;
3526 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3529 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3530 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3531 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3532 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3533 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3534 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3535 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3536 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3537 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3538 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3544 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3545 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3546 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3547 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3548 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3549 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3554 for (x = startx;x < endx;x++)
3556 __m128i color, lightmap, pix;
3557 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3560 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3561 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3562 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3563 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3564 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3565 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3566 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3572 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3573 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3574 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3575 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3578 if (pixel == buffer_FragColorbgra8)
3579 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap wrappers that
// follow call these two shaders before their definitions appear in the file.
3584 void DPSOFTRAST_VertexShader_LightDirection(void);
3585 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader: a thin wrapper whose only work is to run
// DPSOFTRAST_VertexShader_LightDirection().
3587 void DPSOFTRAST_VertexShader_FakeLight(void)
3589 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader: forwards thread, triangle and span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which contains its own
// SHADERMODE_FAKELIGHT branches keyed on thread->shader_mode.
3592 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3594 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Model-space light-direction-map vertex shader: runs the shared
// LightDirection vertex shader and then additionally loads TEXCOORD4.
3599 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3601 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set DPSOFTRAST_PixelShader_LightDirection uses
// for its lightmap and deluxemap lookups in the LIGHTDIRECTIONMAP modes
3602 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Model-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which contains the
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches keyed on thread->shader_mode.
3605 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3607 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Tangent-space light-direction-map vertex shader: runs the shared
// LightDirection vertex shader and then additionally loads TEXCOORD4.
3612 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3614 DPSOFTRAST_VertexShader_LightDirection();
// TEXCOORD4 is the coordinate set DPSOFTRAST_PixelShader_LightDirection uses
// for its lightmap and deluxemap lookups in the LIGHTDIRECTIONMAP modes
3615 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Tangent-space light-direction-map pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which contains the
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches keyed on thread->shader_mode.
3618 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3620 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3625 void DPSOFTRAST_VertexShader_LightDirection(void)
3628 int numvertices = dpsoftrast.numvertices;
3630 float LightVector[4];
3631 float EyePosition[4];
3632 float EyeVectorModelSpace[4];
3638 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3639 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3640 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3641 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3642 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3643 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3644 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3645 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3646 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3647 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3648 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3649 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3650 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3651 for (i = 0;i < numvertices;i++)
3653 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3654 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3655 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3656 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3657 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3658 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3659 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3660 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3661 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3662 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3663 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3664 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3665 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3666 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3667 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3668 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3669 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3670 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3671 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3672 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3673 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3674 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3675 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3676 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3677 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3678 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3679 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3680 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3681 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3683 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar/vector helper macros used by the float shader paths.
// All of them may evaluate their arguments more than once, so never pass an
// expression with side effects (x++, function calls that mutate state, ...).

// smaller / larger of two values
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// squared length / length of the first three elements of v
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v to unit length in place; a zero-length
// vector is left untouched. v is parenthesized at every use so that any
// pointer-valued expression (e.g. base + 4) is accepted, and the temporary
// has a name that is unlikely to collide with an identifier in the argument.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len_ = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len_)\
	{\
		dpsoftrast_normalize_len_ = 1.0f / dpsoftrast_normalize_len_;\
		(v)[0] *= dpsoftrast_normalize_len_;\
		(v)[1] *= dpsoftrast_normalize_len_;\
		(v)[2] *= dpsoftrast_normalize_len_;\
	}\
}\
while(0)
// Scalar float pixel shader shared by the light-direction shader modes; the
// FakeLight and LightDirectionMap pixel shader wrappers forward here.
// For every x in [span->startx, span->endx) it combines the sampled textures
// with the colour uniforms into buffer_FragColorbgra8, then hands that buffer
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three permutation paths exist: SHADERPERMUTATION_SPECULAR (diffuse plus
// specular), SHADERPERMUTATION_DIFFUSE (diffuse only) and neither (ambient
// only); every path optionally adds the glow texture.
// Inside the two lit paths thread->shader_mode selects where the light
// direction and light colour come from (model-space deluxemap, tangent-space
// deluxemap, fake light along the eye vector, or the interpolated TEXCOORD5
// light vector with the LightColor uniform).
// NOTE(review): the embedded line numbers in this listing have gaps, so
// braces, `else` keywords and a few short declarations/assignments (z, d,
// diffuse, specular, glosstex, eyenormal, f) are not visible here.
3705 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers sized by DPSOFTRAST_DRAW_MAXSPANLENGTH; they are
// filled by the DPSOFTRAST_Draw_Span_* calls below, four bytes per pixel for
// the *bgra8 buffers
3707 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3708 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3709 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3710 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3711 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3712 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3713 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3714 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3715 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3716 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3717 int x, startx = span->startx, endx = span->endx;
3718 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// *data / *slope pairs are produced by DPSOFTRAST_CALCATTRIB4F and evaluated
// per pixel as data + slope * x
3719 float LightVectordata[4];
3720 float LightVectorslope[4];
3721 float EyeVectordata[4];
3722 float EyeVectorslope[4];
3723 float VectorSdata[4];
3724 float VectorSslope[4];
3725 float VectorTdata[4];
3726 float VectorTslope[4];
3727 float VectorRdata[4];
3728 float VectorRslope[4];
3730 float diffusetex[4];
3732 float surfacenormal[4];
3733 float lightnormal[4];
3734 float lightnormal_modelspace[4];
3736 float specularnormal[4];
3739 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped (uniform 0,1,2
// goes to index 2,1,0), presumably to match the channel order of the *bgra8
// texel buffers they are multiplied with below.
3741 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3742 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3743 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3744 Color_Glow[3] = 0.0f;
3745 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3746 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3747 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// slot 3 of the ambient colour carries the Alpha uniform; it is the only
// factor applied to the colour texel's 4th byte when d[3] is computed
3748 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3749 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3750 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3751 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3752 Color_Pants[3] = 0.0f;
3753 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3754 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3755 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3756 Color_Shirt[3] = 0.0f;
// textures needed regardless of path: colour always, pants/shirt and glow
// only when their permutation bits are set (all sampled with TEXCOORD0)
3757 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3758 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3759 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3761 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3762 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3764 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3766 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: diffuse + specular lighting ----
3768 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3770 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3771 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3772 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3773 Color_Diffuse[3] = 0.0f;
// LightColor starts as the uniform; the two LIGHTDIRECTIONMAP modes and
// FAKELIGHT overwrite components 0..2 per pixel inside the loop
3774 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3775 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3776 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3777 LightColor[3] = 0.0f;
3778 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3779 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3780 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3781 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3782 Color_Specular[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by the gloss texel's
// 4th byte (0..255) to form the pow() exponent
3783 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// the eye vector (TEXCOORD6, written by the LightDirection vertex shader) is
// needed by every mode in this path for the specular term
3784 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3785 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// per-mode setup: extra interpolants and lightmap/deluxemap lookups
3787 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3789 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3790 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3791 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3792 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3793 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3795 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3797 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3798 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3800 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3802 // nothing of this needed
// remaining mode (its branch keyword is on a line not present in this
// listing): interpolate the light vector the vertex shader stored in TEXCOORD5
3806 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel shading loop for the specular path
// NOTE(review): z is used below but never assigned on a visible line;
// presumably `z = buffer_z[x];` sits on an elided line at the loop top - confirm
3809 for (x = startx;x < endx;x++)
3812 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3813 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3814 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3815 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping: add the pants and shirt texels, each tinted by its colour
// uniform (slot 3 of both tints is 0, so the 4th component is unchanged)
3816 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3818 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3819 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3820 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3821 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3823 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3824 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3825 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3826 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal-map texel: byte * 1/128 - 1, with byte 2 feeding
// component 0 and byte 0 feeding component 2, then renormalize
3827 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3828 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3829 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3830 DPSOFTRAST_Vector3Normalize(surfacenormal);
// model-space deluxemap: decode the light direction, rotate it into the
// surface basis via the interpolated VectorS/T/R, and normalize
3832 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3834 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3835 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3836 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3837 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3839 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3840 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3841 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3842 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3844 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3845 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3846 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3847 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3849 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3850 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3851 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3852 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3854 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3855 DPSOFTRAST_Vector3Normalize(lightnormal);
3857 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// light colour = lightmap byte / (256 * max(0.25, lightnormal[2])); the 0.25
// floor caps the boost at 4x when the light direction is nearly tangent
3859 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3860 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3861 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3862 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// tangent-space deluxemap: the decoded texel is used directly as the light
// direction (no renormalization here); light colour = lightmap byte / 256
3865 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3867 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3868 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3869 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3871 float f = 1.0f / 256.0f;
3872 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3873 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3874 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// fake light: the light direction is the normalized eye vector and the light
// colour is forced to 1
3877 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3879 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3880 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3881 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3882 DPSOFTRAST_Vector3Normalize(lightnormal);
3884 LightColor[0] = 1.0;
3885 LightColor[1] = 1.0;
3886 LightColor[2] = 1.0;
// remaining mode: normalized interpolated light vector; LightColor keeps the
// uniform value loaded before the loop
3890 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3891 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3892 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3893 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surface normal, light direction), clamped at 0
3896 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term, variant 1: reflect the light direction about the surface
// normal and dot the result with the normalized eye vector
3898 if(thread->shader_exactspecularmath)
3900 // reflect lightnormal at surfacenormal, take the negative of that
3901 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3903 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3904 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3905 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3906 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3908 // dot of this and normalize(EyeVectorFogDepth.xyz)
3909 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3910 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3911 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3912 DPSOFTRAST_Vector3Normalize(eyenormal);
3914 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// specular term, variant 2: normalized sum of light and eye directions
// (half vector), dotted with the surface normal
3918 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921 DPSOFTRAST_Vector3Normalize(eyenormal);
3923 specularnormal[0] = lightnormal[0] + eyenormal[0];
3924 specularnormal[1] = lightnormal[1] + eyenormal[1];
3925 specularnormal[2] = lightnormal[2] + eyenormal[2];
3926 DPSOFTRAST_Vector3Normalize(specularnormal);
3928 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform / 255 * gloss texel 4th byte
3931 specular = pow(specular, SpecularPower * glosstex[3]);
// final colour: [glow +] ambient + (diffuse + specular) * light colour, with
// the 4th channel = colour texel 4th component * Alpha uniform.
// NOTE(review): only the upper bound (255) is clamped; a negative sum would
// not be caught before the store into the unsigned char buffer - confirm the
// inputs cannot make the sum negative.
3932 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3934 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3935 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3936 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3937 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same formula without the glow term
3941 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3942 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3943 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3944 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3947 buffer_FragColorbgra8[x*4+0] = d[0];
3948 buffer_FragColorbgra8[x*4+1] = d[1];
3949 buffer_FragColorbgra8[x*4+2] = d[2];
3950 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse lighting only (no gloss texture, no specular term) ----
3953 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3955 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3956 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3957 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3958 Color_Diffuse[3] = 0.0f;
3959 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3960 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3961 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3962 LightColor[3] = 0.0f;
3963 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// per-mode setup; unlike the specular path, the eye vector is interpolated
// only for FAKELIGHT because no specular term needs it here
3965 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3967 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3968 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3969 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3970 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3971 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3973 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3975 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3976 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3978 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3980 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// remaining mode: interpolated light vector from TEXCOORD5
3984 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel shading loop for the diffuse path; the light direction and light
// colour selection repeats the specular path's logic line for line
// NOTE(review): as above, the assignment of z is not on a visible line
3987 for (x = startx;x < endx;x++)
3990 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3991 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3992 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3993 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// NOTE(review): no pants/shirt colormapping is applied on any visible line
// of this path, unlike the specular path - confirm that is intended
3994 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3995 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3996 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3997 DPSOFTRAST_Vector3Normalize(surfacenormal);
3999 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4001 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4002 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4003 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4004 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4006 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4007 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4008 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4009 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4011 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4012 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4013 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4014 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4016 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4017 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4018 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4019 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4021 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4022 DPSOFTRAST_Vector3Normalize(lightnormal);
4024 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4026 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4027 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4028 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4029 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4032 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4034 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4035 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4036 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4038 float f = 1.0f / 256.0f;
4039 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4040 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4041 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4044 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4046 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4047 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4048 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4049 DPSOFTRAST_Vector3Normalize(lightnormal);
4051 LightColor[0] = 1.0;
4052 LightColor[1] = 1.0;
4053 LightColor[2] = 1.0;
4057 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4058 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4059 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4060 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term clamped at 0, then colour = [glow +] texel * (ambient +
// diffuse colour * diffuse * light colour); upper clamp at 255 only
4063 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4064 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4066 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4067 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4068 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4069 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4073 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4074 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4075 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4076 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
4078 buffer_FragColorbgra8[x*4+0] = d[0];
4079 buffer_FragColorbgra8[x*4+1] = d[1];
4080 buffer_FragColorbgra8[x*4+2] = d[2];
4081 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: neither SPECULAR nor DIFFUSE - colour texel * ambient colour,
// plus glow when enabled; no normal map or light direction is involved ----
4086 for (x = startx;x < endx;x++)
4089 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4090 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4091 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4092 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4094 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4096 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4097 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4098 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4099 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4103 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4104 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4105 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4106 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4108 buffer_FragColorbgra8[x*4+0] = d[0];
4109 buffer_FragColorbgra8[x*4+1] = d[1];
4110 buffer_FragColorbgra8[x*4+2] = d[2];
4111 buffer_FragColorbgra8[x*4+3] = d[3];
// all three paths end here: hand the shaded span to the common finish step
4114 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the light-source shader mode.
// Copies the LightPosition and EyePosition uniforms into locals, loads or
// transforms the vertex arrays, then for every vertex overwrites TEXCOORD1 with
// the position-to-light vector and TEXCOORD2 with the position-to-eye vector,
// each expressed as dot products against the three per-vertex vectors that were
// read from TEXCOORD1/2/3 (svector, tvector, normal).
// NOTE(review): the declarations of i, position, svector, tvector, normal and
// EyeVector, and the function's braces, are not visible in this listing.
4119 void DPSOFTRAST_VertexShader_LightSource(void)
4122 int numvertices = dpsoftrast.numvertices;
4123 float LightPosition[4];
4124 float LightVector[4];
4125 float LightVectorModelSpace[4];
4126 float EyePosition[4];
4127 float EyeVectorModelSpace[4];
// fetch the two position uniforms (four floats each; only [0..2] are used below)
4133 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4134 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4135 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4136 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4137 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4138 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4139 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4140 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// NOTE(review): the Array_* helpers appear to take (output array, input array[, matrix]) — confirm against their definitions
4141 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4142 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4143 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4144 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4145 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4146 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4147 for (i = 0;i < numvertices;i++)
// snapshot the position and the three basis vectors first, because TEXCOORD1
// and TEXCOORD2 are overwritten further down in this same iteration
4149 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4150 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4151 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4152 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4153 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4154 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4155 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4156 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4157 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4158 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4159 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4160 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector = (LightPosition - position), then dotted with svector, tvector and normal
4161 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4162 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4163 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4164 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4165 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4166 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
// TEXCOORD1 now carries the light vector (4th component forced to 0)
4167 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4168 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4169 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4170 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// eye vector = (EyePosition - position), dotted with the same three vectors
4171 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4172 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4173 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4174 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4175 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4176 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
// TEXCOORD2 now carries the eye vector (4th component forced to 0)
4177 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4178 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4179 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4180 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): the meaning of -1 as the second argument is not visible here — check DPSOFTRAST_Array_TransformProject
4182 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// presumably fills TEXCOORD3 with the position transformed by the ModelToLight matrix;
// the matching pixel shader interpolates TEXCOORD3 as its CubeVector
4183 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span (pixel) stage for the light-source shader mode.
// Samples the textures required by the active shader permutation, then runs one
// of three per-pixel loops over [startx, endx): a SPECULAR loop, a DIFFUSE loop,
// and a third loop that uses only Color_Ambient (presumably the fallback when
// neither flag is set). Each loop derives an attenuation factor from the
// interpolated CubeVector, optionally multiplies it by the shadowmap sample,
// and stores a BGRA8 result (each channel capped at 255) in
// buffer_FragColorbgra8, which is finally passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): braces, "else" lines, the statements guarded by the
// attenuation tests, and the declarations/assignments of z, attenuation,
// diffuse, specular, f, d, glosstex and eyenormal are not visible in this listing.
4186 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4189 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4190 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4191 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4192 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4193 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4194 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4195 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4196 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4197 int x, startx = span->startx, endx = span->endx;
4198 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4199 float CubeVectordata[4];
4200 float CubeVectorslope[4];
4201 float LightVectordata[4];
4202 float LightVectorslope[4];
4203 float EyeVectordata[4];
4204 float EyeVectorslope[4];
4206 float diffusetex[4];
4208 float surfacenormal[4];
4209 float lightnormal[4];
4211 float specularnormal[4];
4214 float SpecularPower;
4215 float CubeVector[4];
// colour uniforms are copied with components 0 and 2 swapped (uniform +0 lands
// in index 2), presumably so the index order lines up with the ...bgra8 buffers
4218 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4219 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4220 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4221 Color_Glow[3] = 0.0f;
4222 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4223 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4224 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the 4th ambient component is taken from the separate Alpha uniform
4225 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4226 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4227 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4228 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4229 Color_Diffuse[3] = 0.0f;
4230 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4231 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4232 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4233 Color_Specular[3] = 0.0f;
4234 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4235 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4236 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4237 Color_Pants[3] = 0.0f;
4238 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4239 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4240 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4241 Color_Shirt[3] = 0.0f;
4242 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4243 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4244 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4245 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by the raw gloss byte glosstex[3]
4246 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// per-span start value and per-pixel slope for the three interpolated vectors
4247 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4248 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4249 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4250 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4251 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4252 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// pants/shirt layers are only sampled when colormapping is enabled
4253 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4255 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4256 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4258 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4259 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: diffuse + specular ----
4260 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4262 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4263 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4264 for (x = startx;x < endx;x++)
// attenuation = 1 - |CubeVector|^2
4267 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4268 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4269 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4270 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): the statement guarded by this test is not visible in this listing
4271 if (attenuation < 0.01f)
4273 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4275 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4276 if (attenuation < 0.01f)
// base colour as floats in byte units (0..255)
4280 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4281 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4282 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4283 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4284 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the tinted pants and shirt layers; their [3] factors are 0, so alpha is unchanged
4286 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4287 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4288 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4289 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4291 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4292 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4293 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4294 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1, reading byte indices 2,1,0 into components 0,1,2
4295 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4296 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4297 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4298 DPSOFTRAST_Vector3Normalize(surfacenormal);
4300 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4301 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4302 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4303 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L with negative values replaced by 0
4305 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4307 if(thread->shader_exactspecularmath)
4309 // reflect lightnormal at surfacenormal, take the negative of that
4310 // i.e. we want (2*dot(N, I) * N - I) for N=surfacenormal, I=lightnormal
4312 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4313 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4314 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4315 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4317 // dot of this and normalize(EyeVectorFogDepth.xyz)
4318 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4319 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4320 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4321 DPSOFTRAST_Vector3Normalize(eyenormal);
4323 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// alternative specular computation (presumably the "else" branch, whose keyword is not
// visible here): normalized (lightnormal + eyenormal) dotted with the surface normal
4327 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4328 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4329 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4330 DPSOFTRAST_Vector3Normalize(eyenormal);
4332 specularnormal[0] = lightnormal[0] + eyenormal[0];
4333 specularnormal[1] = lightnormal[1] + eyenormal[1];
4334 specularnormal[2] = lightnormal[2] + eyenormal[2];
4335 DPSOFTRAST_Vector3Normalize(specularnormal);
4337 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) times the gloss byte in glosstex[3]
4339 specular = pow(specular, SpecularPower * glosstex[3]);
4341 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4343 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4344 attenuation *= (1.0f / 255.0f);
4345 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4346 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4347 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4348 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube-texture factor
4352 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4353 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4354 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4355 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4357 buffer_FragColorbgra8[x*4+0] = d[0];
4358 buffer_FragColorbgra8[x*4+1] = d[1];
4359 buffer_FragColorbgra8[x*4+2] = d[2];
4360 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: diffuse only (normal map sampled, no gloss/specular term) ----
4363 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4365 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4366 for (x = startx;x < endx;x++)
4369 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4370 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4371 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4372 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4373 if (attenuation < 0.01f)
4375 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4377 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4378 if (attenuation < 0.01f)
4382 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4383 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4384 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4385 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4386 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4388 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4389 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4390 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4391 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4393 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4394 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4395 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4396 DPSOFTRAST_Vector3Normalize(surfacenormal);
4398 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4399 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4400 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4401 DPSOFTRAST_Vector3Normalize(lightnormal);
4403 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4404 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4406 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4407 attenuation *= (1.0f / 255.0f);
4408 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4409 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4410 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4411 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4415 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4416 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4417 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4418 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4420 buffer_FragColorbgra8[x*4+0] = d[0];
4421 buffer_FragColorbgra8[x*4+1] = d[1];
4422 buffer_FragColorbgra8[x*4+2] = d[2];
4423 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (no normal map, no diffuse or specular term) ----
4428 for (x = startx;x < endx;x++)
4431 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4432 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4433 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4434 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4435 if (attenuation < 0.01f)
4437 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4439 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4440 if (attenuation < 0.01f)
4444 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4445 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4446 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4447 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4448 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4450 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4451 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4452 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4453 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4455 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4457 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4458 attenuation *= (1.0f / 255.0f);
4459 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4460 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4461 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4462 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4466 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4467 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4468 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4469 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4471 buffer_FragColorbgra8[x*4+0] = d[0];
4472 buffer_FragColorbgra8[x*4+1] = d[1];
4473 buffer_FragColorbgra8[x*4+2] = d[2];
4474 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours to the common span-finish routine
4477 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode.
// Issues three array operations: TEXCOORD1 from POSITION with the
// ModelViewProjection matrix (no projection step), TEXCOORD0 with the texture
// matrix, and POSITION through DPSOFTRAST_Array_TransformProject.
// NOTE(review): assumes the helpers take (output array, input array, matrix) — confirm.
4483 void DPSOFTRAST_VertexShader_Refraction(void)
4485 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4486 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4487 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) stage for the refraction shader mode.
// Returns immediately when no texture is bound to GL20TU_REFRACTION. Otherwise,
// for each pixel in [startx, endx) it builds a screen-space texture coordinate
// from the interpolated TEXCOORD1 value, offsets it by the normalized normal-map
// x/y scaled by DistortScaleRefractReflect, samples mipmap[0] of the refraction
// texture (four-texel weighted blend when DPSOFTRAST_TEXTURE_FILTER_LINEAR is
// set, single texel otherwise) and multiplies the colour by RefractColor.
// NOTE(review): braces, the "else" line, the declarations of z, iw, v and c, and
// the single-texel assignments to c are not visible in this listing.
4490 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4492 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4494 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4496 int x, startx = span->startx, endx = span->endx;
4499 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4500 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4503 float ModelViewProjectionPositiondata[4];
4504 float ModelViewProjectionPositionslope[4];
4507 float ScreenScaleRefractReflect[2];
4508 float ScreenCenterRefractReflect[2];
4509 float DistortScaleRefractReflect[2];
4510 float RefractColor[4];
4512 const unsigned char * RESTRICT pixelbase;
4513 const unsigned char * RESTRICT pixel[4];
4514 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
// nothing to sample from: leave without touching the span
4515 if(!texture) return;
// mipmap[0][0] is used as a byte offset into texture->bytes; [2] and [3] are used
// below as the row length and row count when clamping and addressing texels
4516 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4519 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4520 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4523 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4526 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4527 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4528 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4529 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4530 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4531 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// colour uniform copied with components 0 and 2 swapped; component 3 kept in place
4532 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4533 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4534 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4535 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4538 for (x = startx;x < endx;x++)
4540 float SafeScreenTexCoord[2];
4541 float ScreenTexCoord[2];
4548 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4549 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4551 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4552 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4553 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4555 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4556 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4557 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4558 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4559 DPSOFTRAST_Vector3Normalize(v);
4560 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4561 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4563 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4564 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// 20.12 fixed-point texel coordinate, shifted back by half a texel (2048 = 0.5 << 12)
// NOTE(review): tc is unsigned, so a negative float result is an out-of-range
// conversion, and tc>>12 can never be negative — the ">= 0" clamps below look
// unreachable for tci, so negative coordinates would clamp to the far edge
// instead of 0. Consider a signed tc — confirm before changing.
4566 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
// low 12 bits are the blend fraction; ifrac is its complement to 1.0 (0x1000)
4567 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4568 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// four corner weights; each is a product of two 12-bit factors, hence the >>24 below
4569 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4570 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4571 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel indices and their +1 neighbours to [0, size-1]
4572 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4573 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4574 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4575 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// addresses of the four neighbouring texels (4 bytes per texel)
4576 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4577 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4578 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4579 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4580 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4581 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4582 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// unfiltered sampling: truncate to one texel index and clamp it
4586 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4587 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4588 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4589 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4595 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// NOTE(review): the colour products are stored to bytes without a cap, unlike the
// alpha line below — a RefractColor component above 1 could exceed 255; confirm the uniform's range
4596 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4597 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4598 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4599 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4602 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode: the only visible work is running the
// position array through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform.
4607 void DPSOFTRAST_VertexShader_Water(void)
4609 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the water shader mode: begins the span, zero-fills the
// FragColor bytes for pixels [startx, endx) and passes that all-zero buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8. No textures or uniforms are read.
// NOTE(review): looks like a placeholder implementation — confirm.
4613 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4616 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4617 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4618 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4619 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4620 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode: the only visible work is running
// the position array through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform.
4625 void DPSOFTRAST_VertexShader_ShowDepth(void)
4627 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the show-depth shader mode: begins the span, zero-fills the
// FragColor bytes for pixels [startx, endx) and passes that all-zero buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8. buffer_z is filled by Span_Begin but not
// otherwise read here.
// NOTE(review): looks like a placeholder implementation — confirm.
4630 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4633 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4634 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4635 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4636 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4637 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode: the only visible work is
// running the position array through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjection matrix uniform.
4642 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4644 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the deferred-geometry shader mode: begins the span, zero-fills
// the FragColor bytes for pixels [startx, endx) and passes that all-zero buffer
// to DPSOFTRAST_Draw_Span_FinishBGRA8. No textures or uniforms are read.
// NOTE(review): looks like a placeholder implementation — confirm.
4647 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4650 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4651 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4652 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4653 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4654 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode: the only visible work
// is running the position array through DPSOFTRAST_Array_TransformProject with
// the ModelViewProjection matrix uniform.
4659 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4661 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span stage for the deferred-light-source shader mode: begins the span,
// zero-fills the FragColor bytes for pixels [startx, endx) and passes that
// all-zero buffer to DPSOFTRAST_Draw_Span_FinishBGRA8. No textures or uniforms are read.
// NOTE(review): looks like a placeholder implementation — confirm.
4664 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4667 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4668 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4669 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, so both the offset and the length are scaled by 4
4670 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4671 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex-stage and span-stage entry points plus
// two byte lists naming the vertex arrays and texture units it uses.
// NOTE(review): the struct's braces and its first member (the integer that every
// DPSOFTRAST_ShaderModeTable row initialises with 2) are not visible in this listing.
4676 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point; takes no arguments
4679 void (*Vertex)(void);
// span-stage entry point, called with the thread state, triangle and span
4680 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* ids; the table rows end each list with ~0 (stored as 255 in a byte)
4681 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* ids; the table rows that provide this list end it with ~0 as well
4682 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4684 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with one row per shader mode (SHADERMODE_COUNT rows).
// Each row is {integer (always 2 here), vertex function, span function,
// {array ids..., ~0}, {texture unit ids..., ~0}}.
// NOTE(review): rows are presumably in SHADERMODE_* enum order — verify against
// the enum, since the table is indexed positionally.
// NOTE(review): the Water, ShowDepth, DeferredGeometry and DeferredLightSource
// rows omit the texunits initializer, so that array is zero-filled rather than
// starting with ~0 like the Depth_Or_Shadow row — confirm this is intended.
4686 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4688 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4689 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4690 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4691 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4692 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4693 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4694 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4695 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4696 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4697 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4698 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4699 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// the remaining rows list no arrays and give no texunits initializer (see note above)
4700 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4701 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4702 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4703 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued in thread->spans[0 .. thread->numspans) and then resets
// thread->numspans to 0.
// With depth testing enabled and a depth buffer present, each span gets a per-pixel
// pass/fail mask from the configured depth function, is trimmed to the passing range,
// is handed to the shader mode's Span callback (only if a color buffer is bound and
// color writes are enabled), and finally, if depthmask is set, is walked again for the
// depth update. Without depth testing the mask is filled with 1 and only the Span
// callback runs.
// NOTE(review): several short lines (braces, local declarations, loop bodies) are not
// shown in this listing; comments below describe only what is visible.
4706 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4713 // unsigned int *colorpixel;
4714 unsigned int *depthpixel;
4720 DPSOFTRAST_State_Triangle *triangle;
4721 DPSOFTRAST_State_Span *span;
4722 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
4723 for (i = 0; i < thread->numspans; i++)
4725 span = &thread->spans[i];
4726 triangle = &thread->triangles[span->triangle];
4727 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane (w[0] = x slope, w[1] = y slope, w[2] = origin) at the
// span's base position and convert it to integer depth units
4729 wslope = triangle->w[0];
4730 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4731 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// polygon offset bias: constant term polygonoffset[1] plus polygonoffset[0] scaled by |wslope|
4732 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel points at the span's base pixel, so it is indexed with span-relative x below
4733 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4734 startx = span->startx;
// fill pixelmask[x] for x in [startx, endx) with the result of the depth comparison
4736 switch(thread->fb_depthfunc)
4739 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4740 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4741 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4742 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4743 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4744 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4745 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4747 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4748 //for (x = startx;x < endx;x++)
4749 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4750 // if there is no color buffer, skip pixel shader
// skip leading and trailing pixels that failed the depth test
// (loop bodies not shown here - presumably startx++ / endx--)
4751 while (startx < endx && !pixelmask[startx])
4753 while (endx > startx && !pixelmask[endx-1])
4756 continue; // no pixels to fill
4757 span->pixelmask = pixelmask;
4758 span->startx = startx;
4760 // run pixel shader if appropriate
4761 // do this before running depthmask code, to allow the pixelshader
4762 // to clear pixelmask values for alpha testing
4763 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4764 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// NOTE(review): the body of this loop is not shown - presumably it stores d into
// depthpixel[x] for pixels still set in pixelmask; confirm against the full source
4765 if (thread->depthmask)
4766 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4772 // no depth testing means we're just dealing with color...
4773 // if there is no color buffer, skip pixel shader
4774 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel in the span passes: mark [span->startx, span->endx) with 1
4776 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4777 span->pixelmask = pixelmask;
4778 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans consumed
4782 thread->numspans = 0;
// Declares the Draw command (id 22) through the DEFCOMMAND macro, with the payload
// fields listed below. As used in this file: starty/endy hold the draw's row range
// clamped to the framebuffer height; refcount is set to dpsoftrast.numthreads and
// decremented by each thread once it is done with the command; clipped is copied from
// dpsoftrast.drawclipped; arrays points at the per-draw vertex data; element3i /
// element3s are index arrays, left NULL unless a copy is made at allocation time.
// NOTE(review): the DEFCOMMAND macro itself is defined outside this excerpt.
4785 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of a single thread.
// The thread owns two row bands (miny1..maxy1 and miny2..maxy2, clamped to the scissor
// rectangle). If the command's row range misses both bands the thread only drops its
// reference. Otherwise each triangle is fetched, clipped against the near plane,
// reduced to integer screen bounds and rejected if off screen, then given per-triangle
// interpolation planes (w, optional user clip plane, one plane per shader vertex array)
// and a mip level per texture unit. Spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
// pixels are appended to thread->spans and flushed through DPSOFTRAST_Draw_ProcessSpans
// whenever the span or triangle buffers fill up, and once more at the end.
// Each thread decrements command->refcount when finished; the thread that brings it to
// zero frees command->arrays when commandsize shows the data was not stored inline
// after the command.
// NOTE(review): many short lines (braces, simple declarations, break/continue
// statements) are not shown in this listing; comments describe the visible code only.
4787 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4790 int cullface = thread->cullface;
4791 int minx, maxx, miny, maxy;
4792 int miny1, maxy1, miny2, maxy2;
4793 __m128i fbmin, fbmax;
4794 __m128 viewportcenter, viewportscale;
4795 int firstvertex = command->firstvertex;
4796 int numvertices = command->numvertices;
4797 int numtriangles = command->numtriangles;
4798 const int *element3i = command->element3i;
4799 const unsigned short *element3s = command->element3s;
4800 int clipped = command->clipped;
4807 int starty, endy, bandy;
4811 float clip0origin, clip0slope;
4813 __m128 triangleedge1, triangleedge2, trianglenormal;
4816 DPSOFTRAST_State_Triangle *triangle;
4817 DPSOFTRAST_Texture *texture;
4818 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands to the scissor rectangle
4819 miny = thread->fb_scissor[1];
4820 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4821 miny1 = bound(miny, thread->miny1, maxy);
4822 maxy1 = bound(miny, thread->maxy1, maxy);
4823 miny2 = bound(miny, thread->miny2, maxy);
4824 maxy2 = bound(miny, thread->maxy2, maxy);
// the draw touches neither band: just drop this thread's reference to the command
4825 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4827 if (!ATOMIC_DECREMENT(command->refcount))
4829 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4830 MM_FREE(command->arrays);
4834 minx = thread->fb_scissor[0];
4835 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// inclusive min/max bounds replicated as (x, y) pairs of 16-bit lanes
4836 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4837 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4838 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4839 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4840 screen[3] = _mm_setzero_ps();
4841 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4842 for (i = 0;i < numtriangles;i++)
// command->arrays starts with numvertices screen coordinates, followed by the
// per-vertex attribute arrays (numvertices float[4] each)
4844 const float *screencoord4f = command->arrays;
4845 const float *arrays = screencoord4f + numvertices*4;
4847 // generate the 3 edges of this triangle
4848 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the triangle's vertex indices relative to firstvertex, from whichever index
// array is in use (the selecting conditions are not shown in this listing)
4851 e[0] = element3s[i*3+0] - firstvertex;
4852 e[1] = element3s[i*3+1] - firstvertex;
4853 e[2] = element3s[i*3+2] - firstvertex;
4857 e[0] = element3i[i*3+0] - firstvertex;
4858 e[1] = element3i[i*3+1] - firstvertex;
4859 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros for the per-triangle code below:
//  SKIPBACKFACE              - builds two screen-space edges from screen[1] and computes one
//                              cross-product component into trianglenormal, whose sign is
//                              then tested (presumably per cullface, skipping the triangle;
//                              the switch/continue lines are not shown here).
//  CLIPPEDVERTEXLERP(k,a,b)  - stores clipdist[a]/(clipdist[a]-clipdist[b]) in clipfrac[a],
//                              interpolates the position between vertices a and b by that
//                              fraction and projects the result into screen[k].
//  CLIPPEDVERTEXCOPY(k,a)    - loads vertex a's entry of screencoord4f into screen[k].
//  GENATTRIBCOPY / GENATTRIBLERP - the same copy / interpolate operations applied to the
//                              current attribute array, reusing clipfrac[].
//  GENATTRIBS                - picks copy or lerp for each of three attribute values by
//                              case number 0..6 (the switch selector is not shown here).
4868 #define SKIPBACKFACE \
4869 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4870 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4871 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4872 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4873 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4877 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4881 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4886 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4887 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4889 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4890 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4892 #define CLIPPEDVERTEXCOPY(k,p1) \
4893 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4895 #define GENATTRIBCOPY(attrib, p1) \
4896 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4897 #define GENATTRIBLERP(attrib, p1, p2) \
4899 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4900 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4902 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4906 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4907 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4908 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4909 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4910 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4911 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4912 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4918 // calculate distance from nearplane
4919 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4920 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4921 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the triangle by which vertices have clipdist >= 0 and build the clipped
// polygon (3 or 4 screen points) accordingly
4922 if (clipdist[0] >= 0.0f)
4924 if (clipdist[1] >= 0.0f)
4926 if (clipdist[2] >= 0.0f)
4929 // triangle is entirely in front of nearplane
4930 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4937 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4945 if (clipdist[2] >= 0.0f)
4947 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4954 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4961 else if (clipdist[1] >= 0.0f)
4963 if (clipdist[2] >= 0.0f)
4965 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4972 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4978 else if (clipdist[2] >= 0.0f)
4980 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4985 else continue; // triangle is entirely behind nearplane
4988 // calculate integer y coords for triangle points
4989 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4990 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4991 screenmin = _mm_min_epi16(screeni, screenir),
4992 screenmax = _mm_max_epi16(screeni, screenir);
4993 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4994 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
// clamp the polygon's integer bounding box to the scissor/band bounds
4995 screenmin = _mm_max_epi16(screenmin, fbmin);
4996 screenmax = _mm_min_epi16(screenmax, fbmax);
4997 // skip offscreen triangles
4998 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range covered by the polygon: starty inclusive, endy exclusive
5000 starty = _mm_extract_epi16(screenmin, 1);
5001 endy = _mm_extract_epi16(screenmax, 1)+1;
// test for a polygon starting at/after the end of band 1 and ending at/before the
// start of band 2 (the guarded statement is not shown - presumably continue)
5002 if (starty >= maxy1 && endy <= miny2)
5004 screeny = _mm_srai_epi32(screeni, 16);
5007 triangle = &thread->triangles[thread->numtriangles];
5009 // calculate attribute plans for triangle data...
5010 // okay, this triangle is going to produce spans, we'd better project
5011 // the interpolants now (this is what gives perspective texturing),
5012 // this consists of simply multiplying all arrays by the W coord
5013 // (which is basically 1/Z), which will be undone per-pixel
5014 // (multiplying by Z again) to get the perspective-correct array
5017 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5018 __m128 mipedgescale, mipdensity;
// edge vectors divided by the normal component give the factors that turn per-edge
// attribute deltas into per-pixel x/y slopes
5019 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5020 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5021 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5022 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5023 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast each screen point's w (lane 3)
5024 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5025 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5026 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5027 attribedge1 = _mm_sub_ss(w0, w1);
5028 attribedge2 = _mm_sub_ss(w2, w1);
5029 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5030 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5031 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5032 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5033 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// store the plane equation for w: [0] = x slope, [1] = y slope, [2] = value at the origin
5034 _mm_store_ss(&triangle->w[0], attribxslope);
5035 _mm_store_ss(&triangle->w[1], attribyslope);
5036 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: build the plane of screen z (lane 2) the same way, combine it with
// thread->clipplane into cliporigin + x*clipxslope + y*clipyslope, and derive the
// per-row boundary clip0origin + row*clip0slope consumed by the span loop below
// (the conditions selecting between the branches are partly not shown)
5041 if(thread->clipplane[0] || thread->clipplane[1] || thread->clipplane[2])
5043 float cliporigin, clipxslope, clipyslope;
5044 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5045 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5046 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5047 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5048 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5049 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5050 cliporigin = _mm_cvtss_f32(attriborigin)*thread->clipplane[2] + thread->clipplane[3];
5051 clipxslope = thread->clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->clipplane[2];
5052 clipyslope = thread->clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->clipplane[2];
5055 clip0origin = -cliporigin/clipxslope;
5056 clip0slope = -clipyslope/clipxslope;
5057 clip0dir = clipxslope > 0 ? 1 : -1;
5059 else if(clipyslope > 0)
5061 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5062 clip0slope = dpsoftrast.fb_width;
5065 else if(clipyslope < 0)
5067 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5068 clip0slope = -dpsoftrast.fb_width;
5071 else if(clip0origin < 0) continue;
5074 mipedgescale = _mm_setzero_ps();
// for each vertex array listed for the shader mode: gather the three (possibly clipped)
// attribute values, multiply by w, and store the resulting plane in triangle->attribs[k]
5075 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5077 __m128 attrib0, attrib1, attrib2;
5078 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5079 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5081 arrays += numvertices*4;
5082 GENATTRIBS(attrib0, attrib1, attrib2);
5083 attriborigin = _mm_mul_ps(attrib1, w1);
5084 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5085 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5086 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5087 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5088 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5089 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5090 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5091 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array used for LOD selection, record the attribute change along the two
// edges scaled by the reciprocal screen-space edge length
5092 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5094 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5095 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5096 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5097 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level for each texture unit listed for the shader mode (default 0)
5101 memset(triangle->mip, 0, sizeof(triangle->mip));
5102 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5104 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5105 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5107 texture = thread->texbound[texunit];
5108 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5110 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5111 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5112 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5113 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5114 // this will be multiplied in the texturing routine by the texture resolution
5115 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the density estimate, clamped to the texture's last mip level
5118 y = (int)(log((float)y)*0.5f/M_LN2);
5119 if (y > texture->mipmaps - 1)
5120 y = texture->mipmaps - 1;
5121 triangle->mip[texunit] = y;
// walk the polygon's rows: first the part inside band 1 (bandy = end of band 1), then
// jump to the start of band 2 and continue to endy
5127 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5130 __m128 xcoords, xslope;
// compare y against each point's integer y; the resulting mask selects which pair of
// polygon edges (previous/next point indices) bound the current row
5131 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5132 int yccmask = _mm_movemask_epi8(ycc);
5133 int edge0p, edge0n, edge1p, edge1n;
5141 case 0xFFFF: /*0000*/ y = endy; continue;
5142 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5143 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5144 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5145 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5146 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5147 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5148 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5149 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5150 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5151 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5152 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5153 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5154 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5155 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5156 case 0x0000: /*1111*/ y++; continue;
5164 case 0xFFFF: /*000*/ y = endy; continue;
5165 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5166 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5167 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5168 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5169 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5170 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5171 case 0x0000: /*111*/ y++; continue;
// nexty = last row that can use this edge pair, limited to the current band
5174 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5175 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5176 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5177 nexty = _mm_extract_epi16(ycc, 0);
5178 if (nexty >= bandy) nexty = bandy-1;
// x step per row along both edges, and both edges' x at row y (plus 0.5)
5179 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5180 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5181 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5182 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5183 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the edge with the smaller x in the low half
5184 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5186 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5187 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5189 clip0 = clip0origin + (y+0.5f)*clip0slope;
5190 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5192 int startx, endx, clipx = minx, offset;
5193 startx = _mm_cvtss_si32(xcoords);
5194 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5197 if (startx < 0) startx = 0;
// advance startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (the condition guarding this line is not shown)
5198 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5200 if (endx > maxx) endx = maxx;
5201 if (startx >= endx) continue;
// apply the user clip boundary clip0 to the row (branch conditions partly not shown)
5209 if(endx <= clip0) continue;
5210 clipx = max((int)clip0, minx);
5211 startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5214 else if (endx > clip0)
5216 if(startx >= clip0) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels, one span
// each; span->startx / span->endx are relative to the chunk's offset
5221 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5223 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5224 span->triangle = thread->numtriangles;
5227 span->startx = max(clipx - offset, 0);
5228 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5229 if (span->startx >= span->endx)
// span buffer full: process what has been queued so far
5231 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5232 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush the pending spans and start over at triangle slot 0
5237 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5239 DPSOFTRAST_Draw_ProcessSpans(thread);
5240 thread->numtriangles = 0;
// done with the command: drop this thread's reference; the last thread frees vertex
// data that was not stored inline after the command
5244 if (!ATOMIC_DECREMENT(command->refcount))
5246 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5247 MM_FREE(command->arrays);
// flush whatever is still queued for this draw
5250 if (thread->numspans > 0 || thread->numtriangles > 0)
5252 DPSOFTRAST_Draw_ProcessSpans(thread);
5253 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its per-draw data and wires the
// global vertex-output pointers to that storage.
// The data area holds, in order: numvertices screen coordinates, numvertices
// post-transform positions, one float[4]-per-vertex array for each vertex array listed
// for the current shader mode, and a copy of the element (index) array.
// If command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE, only the command
// header is placed in the command buffer and the data is allocated separately with
// MM_CALLOC; otherwise the data directly follows the command.
// NOTE(review): the conditions selecting the 16-bit vs 32-bit index copy and the
// return statement are not shown in this listing (presumably returns `command`).
5258 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5262 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// screen coordinates + post-transform positions
5263 int datasize = 2*numvertices*sizeof(float[4]);
5264 DPSOFTRAST_Command_Draw *command;
5265 unsigned char *data;
// one more float[4] array per vertex array the current shader mode uses
5266 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5268 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5269 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5271 datasize += numvertices*sizeof(float[4]);
// room for the index copy (one of the two lines applies; selector not shown)
5274 datasize += numtriangles*sizeof(unsigned short[3]);
5276 datasize += numtriangles*sizeof(int[3]);
5277 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: keep only the header in the command buffer
5278 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5280 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5281 data = (unsigned char *)MM_CALLOC(datasize, 1);
5285 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5286 data = (unsigned char *)command + commandsize;
5288 command->firstvertex = firstvertex;
5289 command->numvertices = numvertices;
5290 command->numtriangles = numtriangles;
5291 command->arrays = (float *)data;
// reset the global output-array pointers, then carve the data area up in order
5292 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5293 dpsoftrast.firstvertex = firstvertex;
5294 dpsoftrast.numvertices = numvertices;
5295 dpsoftrast.screencoord4f = (float *)data;
5296 data += numvertices*sizeof(float[4]);
5297 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5298 data += numvertices*sizeof(float[4]);
5299 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5301 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5302 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5304 dpsoftrast.post_array4f[j] = (float *)data;
5305 data += numvertices*sizeof(float[4]);
// the command keeps its own copy of the indices, placed after the vertex arrays
5307 command->element3i = NULL;
5308 command->element3s = NULL;
5311 command->element3s = (unsigned short *)data;
5312 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5316 command->element3i = (int *)data;
5317 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a draw of numtriangles triangles using the current shader mode.
// Allocates the Draw command, runs the shader mode's Vertex callback, and clamps the
// draw's row range (dpsoftrast.drawstarty / drawendy) to [0, fb_height]. If the range
// is empty the command is discarded: separately allocated vertex data is freed and the
// command allocation is undone. Otherwise the command records drawclipped and a
// reference count of numthreads; in threaded mode the commands are synced and every
// starving thread whose row bands overlap the draw is signalled.
// NOTE(review): braces/else lines are not shown in this listing; the final
// DPSOFTRAST_Draw_FlushThreads() call is presumably the non-threaded path - confirm.
5322 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5324 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5325 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5326 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5327 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: release the data (if it was allocated separately) and the command
5328 if (command->starty >= command->endy)
5330 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5331 MM_FREE(command->arrays);
5332 DPSOFTRAST_UndoCommand(command->commandsize);
5335 command->clipped = dpsoftrast.drawclipped;
// every thread decrements this once it has finished with the command
5336 command->refcount = dpsoftrast.numthreads;
5338 if (dpsoftrast.usethreads)
5341 DPSOFTRAST_Draw_SyncCommands();
5342 for (i = 0; i < dpsoftrast.numthreads; i++)
5344 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// wake only starving threads whose band 1 or band 2 overlaps the draw's rows
5345 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5346 Thread_CondSignal(thread->drawcond);
5351 DPSOFTRAST_Draw_FlushThreads();
// Declares the SetRenderTargets command (id 23) through the DEFCOMMAND macro; its
// payload is the new framebuffer width and height.
5355 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for the SetRenderTargets command: sets the
// DPSOFTRAST_VALIDATE_FB bit in thread->validate. The command's width/height are not
// read in the visible code.
5356 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5358 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Records a new set of render targets in the global state: framebuffer size, depth
// buffer and up to four color buffers. Afterwards recomputes the viewport
// center/scale and queues a SetRenderTargets command carrying the new width/height.
// NOTE(review): the statement guarded by the change-detecting `if` below is not shown
// in this listing (presumably a flush of pending work before the targets are swapped);
// confirm against the full source.
5360 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5362 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any dimension or buffer pointer differs from the current state
5363 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5364 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5365 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5367 dpsoftrast.fb_width = width;
5368 dpsoftrast.fb_height = height;
5369 dpsoftrast.fb_depthpixels = depthpixels;
5370 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5371 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5372 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5373 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5374 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5375 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5376 command->width = width;
5377 command->height = height;
// Runs the queued commands for one thread, from thread->commandoffset up to (not
// including) endoffset in dpsoftrast.commandpool.commands, dispatching on each
// command's opcode to the matching DPSOFTRAST_Interpret_<name> handler.
// Fixed-size commands advance the offset by their aligned struct size; Draw commands
// advance by command->commandsize because their size varies. An offset that reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL wraps to 0. thread->commandoffset is published after
// each Draw command and again when the loop ends.
// NOTE(review): the body of the Reset case is not shown in this listing.
5380 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5382 int commandoffset = thread->commandoffset;
5383 while (commandoffset != endoffset)
5385 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5386 switch (command->opcode)
// INTERPCOMMAND(name) expands to the case label for a fixed-size command: call its
// handler, step over the aligned command struct, wrap the offset at the pool end
5388 #define INTERPCOMMAND(name) \
5389 case DPSOFTRAST_OPCODE_##name : \
5390 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5391 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5392 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5393 commandoffset = 0; \
5395 INTERPCOMMAND(Viewport)
5396 INTERPCOMMAND(ClearColor)
5397 INTERPCOMMAND(ClearDepth)
5398 INTERPCOMMAND(ColorMask)
5399 INTERPCOMMAND(DepthTest)
5400 INTERPCOMMAND(ScissorTest)
5401 INTERPCOMMAND(Scissor)
5402 INTERPCOMMAND(BlendFunc)
5403 INTERPCOMMAND(BlendSubtract)
5404 INTERPCOMMAND(DepthMask)
5405 INTERPCOMMAND(DepthFunc)
5406 INTERPCOMMAND(DepthRange)
5407 INTERPCOMMAND(PolygonOffset)
5408 INTERPCOMMAND(CullFace)
5409 INTERPCOMMAND(AlphaTest)
5410 INTERPCOMMAND(AlphaFunc)
5411 INTERPCOMMAND(SetTexture)
5412 INTERPCOMMAND(SetShader)
5413 INTERPCOMMAND(Uniform4f)
5414 INTERPCOMMAND(UniformMatrix4f)
5415 INTERPCOMMAND(Uniform1i)
5416 INTERPCOMMAND(SetRenderTargets)
5417 INTERPCOMMAND(ClipPlane)
// Draw is variable-sized, so it is handled by hand and the thread's progress is
// published right away
5419 case DPSOFTRAST_OPCODE_Draw:
5420 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5421 commandoffset += command->commandsize;
5422 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5424 thread->commandoffset = commandoffset;
5427 case DPSOFTRAST_OPCODE_Reset:
// publish the final position once endoffset has been reached
5432 thread->commandoffset = commandoffset;
// Worker thread entry point; `data` is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index >= 0: interprets commands whenever the thread's
// commandoffset lags behind dpsoftrast.drawcommand; once caught up it takes
// drawmutex, signals waitcond if someone is flagged as waiting on this thread, marks
// itself starving and sleeps on drawcond until signalled.
// NOTE(review): the return statement is not shown in this listing.
5435 static int DPSOFTRAST_Draw_Thread(void *data)
5437 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5438 while(thread->index >= 0)
5440 if (thread->commandoffset != dpsoftrast.drawcommand)
5442 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5446 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping, so work queued in the meantime is not missed
5447 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5449 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5450 thread->starving = true;
5451 Thread_CondWait(thread->drawcond, thread->drawmutex);
5452 thread->starving = false;
5454 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread state up to dpsoftrast.drawcommand and then resets
// dpsoftrast.commandpool.usedcommands to 0.
// After syncing commands, threaded mode runs two passes: first wake each lagging thread
// that is starving, then wait on each lagging thread's waitcond until it has caught up.
// The last loop interprets the outstanding commands inline for each thread state.
// NOTE(review): brace/else lines are not shown in this listing; the inline loop is
// presumably the non-threaded path - confirm.
5460 static void DPSOFTRAST_Draw_FlushThreads(void)
5462 DPSOFTRAST_State_Thread *thread;
5464 DPSOFTRAST_Draw_SyncCommands();
5465 if (dpsoftrast.usethreads)
// pass 1: wake lagging threads that are asleep on drawcond
5467 for (i = 0; i < dpsoftrast.numthreads; i++)
5469 thread = &dpsoftrast.threads[i];
5470 if (thread->commandoffset != dpsoftrast.drawcommand)
5472 Thread_LockMutex(thread->drawmutex);
5473 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5474 Thread_CondSignal(thread->drawcond);
5475 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block until each lagging thread signals waitcond after catching up
5478 for (i = 0; i < dpsoftrast.numthreads; i++)
5480 thread = &dpsoftrast.threads[i];
5481 if (thread->commandoffset != dpsoftrast.drawcommand)
5483 Thread_LockMutex(thread->drawmutex);
5484 if (thread->commandoffset != dpsoftrast.drawcommand)
5486 thread->waiting = true;
5487 Thread_CondWait(thread->waitcond, thread->drawmutex);
5488 thread->waiting = false;
5490 Thread_UnlockMutex(thread->drawmutex);
// run the outstanding commands on the calling thread for each thread state
5496 for (i = 0; i < dpsoftrast.numthreads; i++)
5498 thread = &dpsoftrast.threads[i];
5499 if (thread->commandoffset != dpsoftrast.drawcommand)
5500 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5503 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which makes
// all thread states catch up with the queued commands.
5506 void DPSOFTRAST_Flush(void)
5508 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): the body is not shown in this listing - presumably it just flushes
// pending work; confirm against the full source.
5511 void DPSOFTRAST_Finish(void)
// DPSOFTRAST_Init: resets the global dpsoftrast state and sets up the
// per-thread rasterizer state (plus worker threads when threading is usable).
//
//   width, height - stored as fb_width / fb_height; also used as the initial
//                   viewport and scissor rectangle size.
//   numthreads    - threading is enabled only when this is > 0 and
//                   Thread_HasThreads() is true; the count actually used is
//                   bound(1, numthreads, 64) when threaded, otherwise 1.
//   interlace     - stored as bound(0, interlace, 1) when threaded, otherwise 0.
//   colorpixels   - stored as fb_colorpixels[0].
//   depthpixels   - stored as fb_depthpixels.
//
// NOTE(review): this listing elides lines (embedded numbers 5517-5525, 5554,
// 5602-5606, ...). The declarations of `i` and `u` and the return statement
// are not visible, so the return value cannot be documented from here.
5516 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5526 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): `u` is declared in elided lines; presumably an int/byte-array
// union set to 1 so that b[3] is nonzero only on big-endian hosts - confirm.
5527 dpsoftrast.bigendian = u.b[3];
5528 dpsoftrast.fb_width = width;
5529 dpsoftrast.fb_height = height;
5530 dpsoftrast.fb_depthpixels = depthpixels;
5531 dpsoftrast.fb_colorpixels[0] = colorpixels;
// NOTE(review): index [1] is assigned three times below; [2] and [3] look like
// the intended targets. The memset above already zeroed the whole state, so
// this looks harmless if the array is wider than two entries - confirm.
5532 dpsoftrast.fb_colorpixels[1] = NULL;
5533 dpsoftrast.fb_colorpixels[1] = NULL;
5534 dpsoftrast.fb_colorpixels[1] = NULL;
// default viewport covers the whole framebuffer
5535 dpsoftrast.viewport[0] = 0;
5536 dpsoftrast.viewport[1] = 0;
5537 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5538 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5539 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with a capacity (texture_max) of 0
5540 dpsoftrast.texture_firstfree = 1;
5541 dpsoftrast.texture_end = 1;
5542 dpsoftrast.texture_max = 0;
// default color is (1, 1, 1, 1)
5543 dpsoftrast.color[0] = 1;
5544 dpsoftrast.color[1] = 1;
5545 dpsoftrast.color[2] = 1;
5546 dpsoftrast.color[3] = 1;
// threading needs both a positive request and platform support
5547 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5548 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5549 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// NOTE(review): the allocation result is not checked in the visible lines
// before the loop below writes through it.
5550 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread slot the same default render state
5551 for (i = 0; i < dpsoftrast.numthreads; i++)
5553 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// NOTE(review): embedded line 5554 is elided here.
5555 thread->cullface = GL_BACK;
// NOTE(review): no assignment to colormask[0] is visible in this listing;
// verify in the full file that it is enabled like the other three entries.
5556 thread->colormask[1] = 1;
5557 thread->colormask[2] = 1;
5558 thread->colormask[3] = 1;
5559 thread->blendfunc[0] = GL_ONE;
5560 thread->blendfunc[1] = GL_ZERO;
5561 thread->depthmask = true;
5562 thread->depthtest = true;
5563 thread->depthfunc = GL_LEQUAL;
5564 thread->scissortest = false;
5565 thread->alphatest = false;
5566 thread->alphafunc = GL_GREATER;
5567 thread->alphavalue = 0.5f;
// per-thread viewport and scissor both start as the full framebuffer
5568 thread->viewport[0] = 0;
5569 thread->viewport[1] = 0;
5570 thread->viewport[2] = dpsoftrast.fb_width;
5571 thread->viewport[3] = dpsoftrast.fb_height;
5572 thread->scissor[0] = 0;
5573 thread->scissor[1] = 0;
5574 thread->scissor[2] = dpsoftrast.fb_width;
5575 thread->scissor[3] = dpsoftrast.fb_height;
5576 thread->depthrange[0] = 0;
5577 thread->depthrange[1] = 1;
5578 thread->polygonoffset[0] = 0;
5579 thread->polygonoffset[1] = 0;
5580 thread->clipplane[0] = 0;
5581 thread->clipplane[1] = 0;
5582 thread->clipplane[2] = 0;
5583 thread->clipplane[3] = 1;
// called after the raw state above has been filled in
5585 DPSOFTRAST_RecalcThread(thread);
// empty work queues and idle flags
5587 thread->numspans = 0;
5588 thread->numtriangles = 0;
5589 thread->commandoffset = 0;
5590 thread->waiting = false;
5591 thread->starving = false;
// NOTE(review): -1 presumably means "everything needs validation" - confirm
// against DPSOFTRAST_Validate.
5593 thread->validate = -1;
5594 DPSOFTRAST_Validate(thread, -1);
// in threaded mode each slot gets two condition variables, a mutex and a
// worker running DPSOFTRAST_Draw_Thread with this slot as its argument
5596 if (dpsoftrast.usethreads)
5598 thread->waitcond = Thread_CreateCond();
5599 thread->drawcond = Thread_CreateCond();
5600 thread->drawmutex = Thread_CreateMutex();
5601 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
// DPSOFTRAST_Shutdown: stops the worker threads (when threading is in use),
// releases texture and thread-state memory, and zeroes the global dpsoftrast
// state. Takes no arguments and returns nothing.
// NOTE(review): this listing elides lines (embedded numbers 5608-5609, 5611,
// 5614, 5617, 5624-5625, ...); the declaration of `i` and the braces are not
// visible.
5607 void DPSOFTRAST_Shutdown(void)
5610 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5612 DPSOFTRAST_State_Thread *thread;
// for each worker: signal drawcond under the mutex, wait for the thread via
// Thread_WaitThread, then destroy its synchronization objects
5613 for (i = 0; i < dpsoftrast.numthreads; i++)
5615 thread = &dpsoftrast.threads[i];
5616 Thread_LockMutex(thread->drawmutex);
// NOTE(review): embedded line 5617 is elided here; presumably it sets the
// state that tells the worker to exit before it is signalled - confirm.
5618 Thread_CondSignal(thread->drawcond);
5619 Thread_UnlockMutex(thread->drawmutex);
5620 Thread_WaitThread(thread->thread, 0);
5621 Thread_DestroyCond(thread->waitcond);
5622 Thread_DestroyCond(thread->drawcond);
5623 Thread_DestroyMutex(thread->drawmutex);
// release the pixel storage of every texture slot below texture_end
// NOTE(review): this loop reads dpsoftrast.texture[i] before the NULL check on
// dpsoftrast.texture a few lines down. DPSOFTRAST_Init zeroes the state and
// then sets texture_end = 1, so if nothing has allocated the array by the time
// this runs, the first iteration would go through a NULL pointer - verify.
5626 for (i = 0;i < dpsoftrast.texture_end;i++)
5627 if (dpsoftrast.texture[i].bytes)
5628 MM_FREE(dpsoftrast.texture[i].bytes);
// the texture array itself is released with free(), the thread array with
// MM_FREE (it is allocated with MM_CALLOC in DPSOFTRAST_Init)
// NOTE(review): confirm the texture array is allocated with a matching
// malloc-family call.
5629 if (dpsoftrast.texture)
5630 free(dpsoftrast.texture);
5631 if (dpsoftrast.threads)
5632 MM_FREE(dpsoftrast.threads);
// leave the global state all-zero, as at the start of DPSOFTRAST_Init
5633 memset(&dpsoftrast, 0, sizeof(dpsoftrast));