3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
// Project boolean type made available under the name `bool`.
10 typedef qboolean bool;
// Byte alignment requested from _mm_malloc by MM_MALLOC / MM_CALLOC.
14 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var)     - declare var with 16-byte alignment
//   ATOMIC(var)    - declare var with 32-byte alignment
//   MEMORY_BARRIER - store fence (_mm_sfence)
//   ATOMIC_COUNTER / ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD
//                  - shared counter type and the operations on it (ATOMIC_ADD yields no value)
// Apple toolchains: OSAtomic*32Barrier functions from libkern.
17 #if defined(__APPLE__)
18 #include <libkern/OSAtomic.h>
19 #define ALIGN(var) var __attribute__((__aligned__(16)))
20 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21 #define MEMORY_BARRIER (_mm_sfence())
22 #define ATOMIC_COUNTER volatile int32_t
23 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync builtins.
26 #elif defined(__GNUC__)
27 #define ALIGN(var) var __attribute__((__aligned__(16)))
28 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29 #define MEMORY_BARRIER (_mm_sfence())
30 //(__sync_synchronize())
31 #define ATOMIC_COUNTER volatile int
32 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and the Interlocked* functions.
35 #elif defined(_MSC_VER)
36 #define ALIGN(var) __declspec(align(16)) var
37 #define ATOMIC(var) __declspec(align(32)) var
38 #define MEMORY_BARRIER (_mm_sfence())
40 #define ATOMIC_COUNTER volatile LONG
41 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment, no barrier, and plain (non-atomic) integer operations.
// NOTE(review): several of the guarding #ifndef/#endif lines are not visible in this listing.
48 #define ALIGN(var) var
51 #define ATOMIC(var) var
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// SSE2 intrinsics header.
70 #include <emmintrin.h>
// Allocate `size` bytes aligned to ATOMIC_SIZE; release with MM_FREE.
72 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
74 static void *MM_CALLOC(size_t nmemb, size_t size)
76 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
77 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained from the _mm_malloc based MM_MALLOC / MM_CALLOC.
81 #define MM_FREE _mm_free
// Alternative definitions built on the C library allocator.
// NOTE(review): the preprocessor conditional that selects between the two sets is not visible in this listing.
83 #define MM_MALLOC(size) malloc(size)
84 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used to size
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
88 typedef enum DPSOFTRAST_ARRAY_e
90 DPSOFTRAST_ARRAY_POSITION,
91 DPSOFTRAST_ARRAY_COLOR,
92 DPSOFTRAST_ARRAY_TEXCOORD0,
93 DPSOFTRAST_ARRAY_TEXCOORD1,
94 DPSOFTRAST_ARRAY_TEXCOORD2,
95 DPSOFTRAST_ARRAY_TEXCOORD3,
96 DPSOFTRAST_ARRAY_TEXCOORD4,
97 DPSOFTRAST_ARRAY_TEXCOORD5,
98 DPSOFTRAST_ARRAY_TEXCOORD6,
99 DPSOFTRAST_ARRAY_TEXCOORD7,
100 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture[]).
// NOTE(review): further members (flags, width, height, depth, sides, mipmaps, size) are
// assigned by DPSOFTRAST_Texture_New but their declarations are not visible in this listing.
104 typedef struct DPSOFTRAST_Texture_s
// filter mode set through DPSOFTRAST_Texture_Filter
111 DPSOFTRAST_TEXTURE_FILTER filter;
// bind counter; declared with the atomic counter type
114 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks a free slot
115 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
116 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command records are padded to a multiple of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND)
// and declared with the ALIGN() attribute.
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every command record: the opcode and the record's padded size in bytes.
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
125 unsigned char opcode;
126 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand at the current write position when
// the tail of the pool is too small for the next command.
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the
// record type DPSOFTRAST_Command_<name>, which starts with the common opcode/commandsize header.
// NOTE(review): the macro line that expands `fields` is not visible in this listing —
// presumably it follows commandsize.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
136 unsigned char opcode; \
137 unsigned short commandsize; \
139 } DPSOFTRAST_Command_##name );
// Size in bytes of the command byte buffer below.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command record; not referenced in the visible code.
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Byte buffer that command records are written into by DPSOFTRAST_AllocateCommand.
// The freecommand / usedcommands members used by the allocator are declared on lines
// not visible in this listing.
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
148 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
150 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed while shading spans.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
154 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][0] = change per x, [1] = change per y, [2] = base value (see DPSOFTRAST_CALCATTRIB);
// aligned because the SSE variant reads the rows with _mm_load_ps
156 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
158 DPSOFTRAST_State_Triangle);
// Evaluates one interpolated attribute at the start of a span (SSE version):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1]
// `slope` and `data` are __m128 lvalues; the attribute rows are read with aligned loads.
160 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
161 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
162 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
163 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
164 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar version of the same computation; `slope` and `data` are float[4] arrays.
// Note that `span` is expanded without parentheses here, so pass a plain identifier.
166 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
167 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
168 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
169 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
170 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
171 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
172 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
173 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
174 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle; x and y also feed the
// attribute interpolation in DPSOFTRAST_CALCATTRIB.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
181 int triangle; // triangle this span was generated by
182 int x; // framebuffer x coord
183 int y; // framebuffer y coord
184 int startx; // usable range (according to pixelmask)
185 int endx; // usable range (according to pixelmask)
186 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
188 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[] and triangles[] arrays.
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Bits of DPSOFTRAST_State_Thread::validate. A set bit means the matching derived
// values are stale and are recomputed by DPSOFTRAST_Validate.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three bits together.
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc maps the
// (sfactor, dfactor) pair and the subtract flag onto one of these values, using
// OPAQUE for any combination it does not list.
198 typedef enum DPSOFTRAST_BLENDMODE_e
200 DPSOFTRAST_BLENDMODE_OPAQUE,
201 DPSOFTRAST_BLENDMODE_ALPHA,
202 DPSOFTRAST_BLENDMODE_ADDALPHA,
203 DPSOFTRAST_BLENDMODE_ADD,
204 DPSOFTRAST_BLENDMODE_INVMOD,
205 DPSOFTRAST_BLENDMODE_MUL,
206 DPSOFTRAST_BLENDMODE_MUL2,
207 DPSOFTRAST_BLENDMODE_SUBALPHA,
208 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209 DPSOFTRAST_BLENDMODE_INVADD,
210 DPSOFTRAST_BLENDMODE_TOTAL
212 DPSOFTRAST_BLENDMODE;
// State owned by one worker thread. The DPSOFTRAST_Interpret_* handlers write the
// command-controlled members; the fb_* members are derived from them by DPSOFTRAST_Validate.
// NOTE(review): many members used elsewhere (index, viewport, scissor, validate, miny1..maxy2,
// drawmutex, drawcond, waitcond, ...) are declared on lines not visible in this listing.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = alongnormal, [1] = intoview (see DPSOFTRAST_Interpret_PolygonOffset)
233 float polygonoffset[2];
236 int shader_permutation;
237 int shader_exactspecularmath;
// textures bound per unit; the pointers are rebased by DPSOFTRAST_Texture_Grow when the table moves
239 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
241 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
244 // DPSOFTRAST_VALIDATE_ flags
247 // derived values (DPSOFTRAST_VALIDATE_FB)
250 ALIGN(float fb_viewportcenter[4]);
251 ALIGN(float fb_viewportscale[4]);
253 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
256 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; compared against dpsoftrast.drawcommand and
// the pool's freecommand in DPSOFTRAST_Draw_FreeCommandPool
265 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool around its wait on this thread's waitcond
267 volatile bool waiting;
// read by DPSOFTRAST_Draw_FreeCommandPool to decide whether to signal drawcond
268 volatile bool starving;
275 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
278 DPSOFTRAST_State_Thread);
// Global rasterizer state used by the API functions.
// NOTE(review): further members used elsewhere (fb_width, fb_height, viewport, numthreads,
// usethreads, interlace, texture_max, texture_end, ...) are declared on lines not visible
// in this listing.
280 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers, addressed as y * fb_width + x
284 unsigned int *fb_depthpixels;
285 unsigned int *fb_colorpixels[4];
// viewport transform, recomputed by DPSOFTRAST_Viewport via DPSOFTRAST_RecalcViewport
288 ALIGN(float fb_viewportcenter[4]);
289 ALIGN(float fb_viewportscale[4]);
292 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
295 const float *pointer_vertex3f;
296 const float *pointer_color4f;
297 const unsigned char *pointer_color4ub;
298 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
301 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
303 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
307 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308 float *screencoord4f;
314 int shader_permutation;
315 int shader_exactspecularmath;
// texture table: texture_firstfree is where DPSOFTRAST_Texture_New starts looking for a free slot
319 int texture_firstfree;
320 DPSOFTRAST_Texture *texture;
// message describing the most recent rejected request
325 const char *errorstring;
// array of numthreads worker states
330 DPSOFTRAST_State_Thread *threads;
// write position published to the workers by DPSOFTRAST_Draw_SyncCommands
332 ATOMIC(volatile int drawcommand);
334 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
338 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to the 32-bit integer
// depth buffer format (1024 * 1048576 = 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit pixel laid out as
// alpha<<24 | red<<16 | green<<8 | blue, rounding each channel to the nearest
// of 256 levels.  Every argument is parenthesized so that expressions such as
// `c + d` convert correctly, and the shifts are performed on unsigned values
// because shifting a signed int into the sign bit (alpha >= 128) is undefined.
// The result is still an int, as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) <<  8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f)        | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth format: d == 1 maps to 0 and
// d == 0 maps to DPSOFTRAST_DEPTHSCALE.  The argument is parenthesized so that
// an expression argument is subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
348 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350 fb_viewportcenter[3] = 0.5f;
351 fb_viewportcenter[0] = 0.0f;
352 fb_viewportscale[1] = 0.5f * viewport[2];
353 fb_viewportscale[2] = -0.5f * viewport[3];
354 fb_viewportscale[3] = 0.5f;
355 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row bands [miny1, maxy1) and [miny2, maxy2) assigned to a thread.
358 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the framebuffer is cut into 2*numthreads slices and the thread gets
// slice `index` and slice `numthreads + index`
360 if (dpsoftrast.interlace)
362 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
363 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
364 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
365 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise (the `else` line is not visible in this listing): one slice out of numthreads,
// stored in both bands
369 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
370 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB for one thread:
// fb_scissor, the viewport transform and the thread's row bands.
374 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
376 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
377 // and viewport projection values
// scissor rectangle with y flipped against the framebuffer height
380 x1 = thread->scissor[0];
381 x2 = thread->scissor[0] + thread->scissor[2];
382 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
383 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
384 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp to the framebuffer (NOTE(review): the lower-bound clamps are presumably on the lines missing here)
386 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
388 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// stored as x, y, width, height
389 thread->fb_scissor[0] = x1;
390 thread->fb_scissor[1] = y1;
391 thread->fb_scissor[2] = x2 - x1;
392 thread->fb_scissor[3] = y2 - y1;
394 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
395 DPSOFTRAST_RecalcThread(thread);
398 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
400 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Recomputes thread->fb_blendmode from the blend factors and the subtract flag.
// The two factors are packed into one switch key, (sfactor << 16) | dfactor; any
// combination without a case falls back to DPSOFTRAST_BLENDMODE_OPAQUE.
403 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only (GL_SRC_ALPHA, GL_ONE) is recognised
405 if (thread->blendsubtract)
407 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC expands to one case label that assigns the blend mode
409 #define BLENDFUNC(sfactor, dfactor, blendmode) \
410 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
411 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
412 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending (the `else` line is not visible in this listing)
417 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
419 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
420 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
421 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
422 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
423 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two factor pairs select the MUL mode
424 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
425 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
426 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
427 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
428 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
429 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested flags is pending;
// the expression always evaluates to 0. `thread` is expanded without parentheses.
434 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state selected by `mask` (DPSOFTRAST_VALIDATE_* bits) that is
// currently marked stale in thread->validate, clearing each flag before its recalculation.
436 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// only flags that are both requested and pending are processed
438 mask &= thread->validate;
441 if (mask & DPSOFTRAST_VALIDATE_FB)
443 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
444 DPSOFTRAST_RecalcFB(thread);
446 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
448 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
449 DPSOFTRAST_RecalcDepthFunc(thread);
451 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
453 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
454 DPSOFTRAST_RecalcBlendFunc(thread);
458 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
460 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
461 return &dpsoftrast.texture[index];
// Enlarges the texture table and rebases every bound-texture pointer (global and
// per-thread) onto the reallocated array.
// NOTE(review): the realloc result is stored without a NULL check, so an allocation
// failure loses the old table and the rebasing below computes pointers from NULL.
465 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to offsets
467 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
468 DPSOFTRAST_State_Thread *thread;
472 // expand texture array as needed
// capacity starts at 1024 entries and doubles afterwards (the `else` line is not visible in this listing)
473 if (dpsoftrast.texture_max < 1024)
474 dpsoftrast.texture_max = 1024;
476 dpsoftrast.texture_max *= 2;
477 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings
478 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
479 if (dpsoftrast.texbound[i])
480 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase the bindings held by every worker thread
481 for (j = 0; j < dpsoftrast.numthreads; j++)
483 thread = &dpsoftrast.threads[j];
484 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
485 if (thread->texbound[i])
486 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture: validates the request, claims a free slot in the texture table,
// fills in the mip level table and allocates zeroed pixel storage.
// Rejected requests record a message in dpsoftrast.errorstring.
// NOTE(review): the return statements are not visible in this listing — presumably the
// slot number on success and 0 on failure; confirm against the full source.
490 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
499 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
500 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
501 DPSOFTRAST_Texture *texture;
502 if (width*height*depth < 1)
504 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
507 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
509 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// per-format restrictions (the switch statement line is not visible in this listing)
514 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
515 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
516 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
518 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
519 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
521 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
529 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures and cubemaps
536 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
538 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
541 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this repeats the condition tested immediately above
546 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero for any value that is not a power of two
556 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
558 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
561 // find first empty slot in texture array
562 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
563 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot being claimed
565 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the slot lies beyond its capacity, and extend the used range
566 if (dpsoftrast.texture_max <= texnum)
567 DPSOFTRAST_Texture_Grow();
568 if (dpsoftrast.texture_end <= texnum)
569 dpsoftrast.texture_end = texnum + 1;
570 texture = &dpsoftrast.texture[texnum];
571 memset(texture, 0, sizeof(*texture));
572 texture->flags = flags;
573 texture->width = width;
574 texture->height = height;
575 texture->depth = depth;
576 texture->sides = sides;
// mip level table: each level occupies w*h*d*sides pixels of 4 bytes and starts at the
// running byte offset `size` (the loop header and the size/dimension updates are not
// visible in this listing)
588 s = w * h * d * sides * 4;
589 texture->mipmap[mipmaps][0] = size;
590 texture->mipmap[mipmaps][1] = s;
591 texture->mipmap[mipmaps][2] = w;
592 texture->mipmap[mipmaps][3] = h;
593 texture->mipmap[mipmaps][4] = d;
// condition on a 1x1x1 level or on a texture without the mipmap flag
// (presumably ends the level loop; the statement it guards is not visible)
596 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
602 texture->mipmaps = mipmaps;
603 texture->size = size;
605 // allocate the pixels now
606 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and returns its slot to the free pool.
// Does nothing when `index` is not a live texture.
610 void DPSOFTRAST_Texture_Free(int index)
612 DPSOFTRAST_Texture *texture;
613 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// NOTE(review): the lines missing before the free presumably synchronise with the workers
// when the texture is still bound — confirm against the full source
617 MM_FREE(texture->bytes);
618 texture->bytes = NULL;
// clearing the slot also leaves bytes == NULL, which marks it free
619 memset(texture, 0, sizeof(*texture));
620 // adjust the free range and used range
621 if (dpsoftrast.texture_firstfree > index)
622 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing free slots
623 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
624 dpsoftrast.texture_end--;
// Regenerates mip levels 1 .. mipmaps-1 of a texture, each from the level before it,
// by averaging neighbouring 4-byte pixels. Does nothing for an invalid index or a
// texture with a single level.
626 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
628 int i, x, y, z, w, layer0, layer1, row0, row1;
// o walks the output row; i0..i3 walk the source rows (row0/row1 of layer0 and layer1)
629 unsigned char *o, *i0, *i1, *i2, *i3;
630 DPSOFTRAST_Texture *texture;
631 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
632 if (texture->mipmaps <= 1)
634 for (i = 1;i < texture->mipmaps;i++)
636 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the last layer of the parent level
// (the assignments of layer0/layer1 are not visible in this listing)
640 if (layer1 >= texture->mipmap[i-1][4])
641 layer1 = texture->mipmap[i-1][4]-1;
642 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the last row of the parent level
// (the assignments of row0/row1 are not visible in this listing)
646 if (row1 >= texture->mipmap[i-1][3])
647 row1 = texture->mipmap[i-1][3]-1;
// row start addresses: level offset + 4 bytes * ((height * layer + row) * width)
648 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
649 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
650 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
651 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
652 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
653 w = texture->mipmap[i][2];
// (the condition selecting the 3D path is not visible in this listing)
656 if (texture->mipmap[i-1][2] > 1)
658 // average 3D texture
// eight source pixels per output pixel, rounded
659 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
661 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
662 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
663 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
664 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
669 // average 3D mipmap with parent width == 1
// four source pixels per output pixel; only i0 and i1 advance in this loop
670 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
672 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
673 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
674 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
675 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
681 if (texture->mipmap[i-1][2] > 1)
683 // average 2D texture (common case)
684 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
686 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
687 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
688 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
689 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
694 // 2D texture with parent width == 1
695 o[0] = (i0[0] + i1[0] + 1) >> 1;
696 o[1] = (i0[1] + i1[1] + 1) >> 1;
697 o[2] = (i0[2] + i1[2] + 1) >> 1;
698 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels` (tightly
// packed rows) into the texture at (blockx, blocky), then regenerates the mipmaps.
// NOTE(review): the visible code addresses level 0 only (mipmap[0]); `mip` is not used
// in the lines shown here.
705 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
707 DPSOFTRAST_Texture *texture;
709 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination pixel: row blocky, column blockx of level 0
714 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
715 while (blockheight > 0)
717 memcpy(dst, pixels, blockwidth * 4);
// source rows are contiguous; destination rows are one full texture row apart
718 pixels += blockwidth * 4;
719 dst += texture->mipmap[0][2] * 4;
723 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mip level 0 with `pixels` (mipmap[0][1] bytes are copied) and
// regenerates the remaining levels. Does nothing for an invalid index.
725 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
727 DPSOFTRAST_Texture *texture;
728 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
733 DPSOFTRAST_Texture_CalculateMipmaps(index);
735 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
737 DPSOFTRAST_Texture *texture;
738 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
739 return texture->mipmap[mip][2];
741 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
743 DPSOFTRAST_Texture *texture;
744 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
745 return texture->mipmap[mip][3];
747 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
751 return texture->mipmap[mip][4];
// Returns a pointer to the first pixel of mip level `mip` inside the texture's storage,
// or 0 (null) when `index` is not a live texture.
// NOTE(review): `mip` is not range-checked in the visible lines.
753 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
755 DPSOFTRAST_Texture *texture;
756 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
// mipmap[mip][0] is the byte offset of the level
759 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Does nothing for an invalid index.
761 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
763 DPSOFTRAST_Texture *texture;
764 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR are refused for textures created without the mipmap flag
// (NOTE(review): presumably followed by an early return on a line not visible here)
765 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
767 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
772 texture->filter = filter;
// Forward declaration; used by DPSOFTRAST_AllocateCommand below.
775 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the pool's current write position (freecommand) as dpsoftrast.drawcommand.
777 static void DPSOFTRAST_Draw_SyncCommands(void)
// with threads in use, issue the store fence before publishing the new position
779 if(dpsoftrast.usethreads) MEMORY_BARRIER;
780 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes of the command pool available by recomputing how
// much of it the worker threads still have to consume, waiting on a thread when needed.
// NOTE(review): loop and return statements are partly missing from this listing; the
// comments below describe only the visible lines.
783 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
785 DPSOFTRAST_State_Thread *thread;
787 int freecommand = dpsoftrast.commandpool.freecommand;
788 int usedcommands = dpsoftrast.commandpool.usedcommands;
789 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish everything written so far to the workers
791 DPSOFTRAST_Draw_SyncCommands();
// measure, for every thread, how far its read position lags behind the write position
797 for (i = 0; i < dpsoftrast.numthreads; i++)
799 thread = &dpsoftrast.threads[i];
800 commandoffset = freecommand - thread->commandoffset;
// distances are taken modulo the pool size
801 if (commandoffset < 0)
802 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest lag (waitindex is presumably set to i on a line not visible here)
803 if (commandoffset > usedcommands)
806 usedcommands = commandoffset;
809 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread unless it has already reached the published position
811 thread = &dpsoftrast.threads[waitindex];
812 Thread_LockMutex(thread->drawmutex);
813 if (thread->commandoffset != dpsoftrast.drawcommand)
815 thread->waiting = true;
// signal drawcond if the thread reports itself as starving, then wait on waitcond
816 if (thread->starving) Thread_CondSignal(thread->drawcond);
817 Thread_CondWait(thread->waitcond, thread->drawmutex);
818 thread->waiting = false;
820 Thread_UnlockMutex(thread->drawmutex);
822 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE (the mask arithmetic assumes
// COMMAND_SIZE is a power of two).
825 #define DPSOFTRAST_ALIGNCOMMAND(size) \
826 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a record for command `name` in the pool and returns it typed as
// DPSOFTRAST_Command_<name> *.
827 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
828 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a record with the given opcode and
// fills in the record header.
// NOTE(review): the return statement and the updates of `freecommand` are not visible in
// this listing; callers use the result as a pointer to the new record.
830 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
832 DPSOFTRAST_Command *command;
833 int freecommand = dpsoftrast.commandpool.freecommand;
834 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one more command header
835 int extra = sizeof(DPSOFTRAST_Command);
// if the record does not fit before the end of the pool, the unused tail counts as needed space too
836 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
837 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// pool too full: reclaim space, then reload the positions
838 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
840 if (dpsoftrast.usethreads)
841 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
// (the `else` line is not visible in this listing)
843 DPSOFTRAST_Draw_FlushThreads();
844 freecommand = dpsoftrast.commandpool.freecommand;
845 usedcommands = dpsoftrast.commandpool.usedcommands;
// not enough room at the end: write a Reset marker at the current position and
// account for the skipped tail
847 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
849 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
850 command->opcode = DPSOFTRAST_OPCODE_Reset;
851 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new record
854 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855 command->opcode = opcode;
856 command->commandsize = size;
// check for the write position reaching the end of the pool
858 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
860 dpsoftrast.commandpool.freecommand = freecommand;
861 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives `size` bytes back to the command pool by adjusting the stored write position
// and usage counter.
// NOTE(review): the statement that moves freecommand back and the condition guarding
// the wrap-around are not visible in this listing.
865 static void DPSOFTRAST_UndoCommand(int size)
867 int freecommand = dpsoftrast.commandpool.freecommand;
868 int usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap-around correction by the pool size
871 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
872 usedcommands -= size;
873 dpsoftrast.commandpool.freecommand = freecommand;
874 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: viewport rectangle.
877 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Worker side: copies the rectangle into the thread state and marks the
// framebuffer-derived values stale.
878 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
880 thread->viewport[0] = command->x;
881 thread->viewport[1] = command->y;
882 thread->viewport[2] = command->width;
883 thread->viewport[3] = command->height;
884 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Viewport command, mirrors the rectangle into the global state and
// recomputes the global viewport transform.
886 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
888 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
891 command->width = width;
892 command->height = height;
894 dpsoftrast.viewport[0] = x;
895 dpsoftrast.viewport[1] = y;
896 dpsoftrast.viewport[2] = width;
897 dpsoftrast.viewport[3] = height;
898 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to an RGBA value.
901 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Worker side: operates on the scissor rectangle restricted to this thread's row bands,
// for every color buffer that is present.
// NOTE(review): the pixel store and several loop-control lines are not visible in this listing.
902 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
904 int i, x1, y1, x2, y2, w, h, x, y;
905 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
909 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
910 miny1 = thread->miny1;
911 maxy1 = thread->maxy1;
912 miny2 = thread->miny2;
913 maxy2 = thread->maxy2;
914 x1 = thread->fb_scissor[0];
915 y1 = thread->fb_scissor[1];
916 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
917 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's bands
918 if (y1 < miny1) y1 = miny1;
919 if (y2 > maxy2) y2 = maxy2;
924 // FIXME: honor fb_colormask?
// pack the clear color into the framebuffer pixel format
925 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
926 for (i = 0;i < 4;i++)
// test for an absent color buffer
928 if (!dpsoftrast.fb_colorpixels[i])
// outer loop: first band up to maxy1, then from miny2 up to maxy2
930 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
933 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
934 for (x = x1;x < x2;x++)
// API side: queues a ClearColor command.
939 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
941 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a depth value.
948 DEFCOMMAND(3, ClearDepth, float depth;)
// Worker side: operates on the scissor rectangle restricted to this thread's row bands.
// NOTE(review): the pixel store and several loop-control lines are not visible in this listing.
949 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
951 int x1, y1, x2, y2, w, h, x, y;
952 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
956 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957 miny1 = thread->miny1;
958 maxy1 = thread->maxy1;
959 miny2 = thread->miny2;
960 maxy2 = thread->maxy2;
961 x1 = thread->fb_scissor[0];
962 y1 = thread->fb_scissor[1];
963 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
964 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's bands
965 if (y1 < miny1) y1 = miny1;
966 if (y2 > maxy2) y2 = maxy2;
// convert the float depth to the integer depth buffer format
971 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// outer loop: first band up to maxy1, then from miny2 up to maxy2
972 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
975 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
976 for (x = x1;x < x2;x++)
// API side: queues a ClearDepth command.
980 void DPSOFTRAST_ClearDepth(float d)
982 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: per-channel color write mask.
986 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Worker side: stores each channel flag as 0 or 1 and builds the matching pixel bit mask.
987 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
989 thread->colormask[0] = command->r != 0;
990 thread->colormask[1] = command->g != 0;
991 thread->colormask[2] = command->b != 0;
992 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives all-zero or all-one bits, which selects the channel's byte:
// red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000
993 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// API side: queues a ColorMask command.
995 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
997 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
1004 DEFCOMMAND(5, DepthTest, int enable;)
// Worker side: stores the flag and marks the derived depth function stale.
1005 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1007 thread->depthtest = command->enable;
1008 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// API side: queues a DepthTest command.
1010 void DPSOFTRAST_DepthTest(int enable)
1012 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1013 command->enable = enable;
// Command 6: enable or disable the scissor test.
1016 DEFCOMMAND(6, ScissorTest, int enable;)
// Worker side: stores the flag and marks the framebuffer-derived values stale.
1017 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1019 thread->scissortest = command->enable;
1020 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a ScissorTest command.
1022 void DPSOFTRAST_ScissorTest(int enable)
1024 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1025 command->enable = enable;
// Command 7: scissor rectangle (x, y, width, height).
1028 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Worker side: copies the rectangle into the thread state and marks the
// framebuffer-derived values stale.
1029 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1031 thread->scissor[0] = command->x;
1032 thread->scissor[1] = command->y;
1033 thread->scissor[2] = command->width;
1034 thread->scissor[3] = command->height;
1035 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API side: queues a Scissor command.
1037 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1039 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1042 command->width = width;
1043 command->height = height;
// Command 8: source and destination blend factors.
1046 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Worker side: stores both factors and marks the derived blend mode stale.
1047 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1049 thread->blendfunc[0] = command->sfactor;
1050 thread->blendfunc[1] = command->dfactor;
1051 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues a BlendFunc command.
1053 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1055 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1056 command->sfactor = sfactor;
1057 command->dfactor = dfactor;
// Command 9: enable or disable subtractive blending.
1060 DEFCOMMAND(9, BlendSubtract, int enable;)
// Worker side: stores the flag and marks the derived blend mode stale.
1061 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1063 thread->blendsubtract = command->enable;
1064 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API side: queues a BlendSubtract command.
1066 void DPSOFTRAST_BlendSubtract(int enable)
1068 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1069 command->enable = enable;
// Command 10: depth mask flag.
1072 DEFCOMMAND(10, DepthMask, int enable;)
// Worker side: stores the flag in the thread state.
1073 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1075 thread->depthmask = command->enable;
// API side: queues a DepthMask command.
1077 void DPSOFTRAST_DepthMask(int enable)
1079 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1080 command->enable = enable;
1083 DEFCOMMAND(11, DepthFunc, int func;)
1084 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1086 thread->depthfunc = command->func;
// API side: queues a DepthFunc command carrying the comparison function.
1088 void DPSOFTRAST_DepthFunc(int func)
1090 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1091 command->func = func;
1094 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1095 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1097 thread->depthrange[0] = command->nearval;
1098 thread->depthrange[1] = command->farval;
1100 void DPSOFTRAST_DepthRange(float nearval, float farval)
1102 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1103 command->nearval = nearval;
1104 command->farval = farval;
1107 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1108 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1110 thread->polygonoffset[0] = command->alongnormal;
1111 thread->polygonoffset[1] = command->intoview;
1113 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1115 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1116 command->alongnormal = alongnormal;
1117 command->intoview = intoview;
1120 DEFCOMMAND(14, CullFace, int mode;)
1121 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1123 thread->cullface = command->mode;
1125 void DPSOFTRAST_CullFace(int mode)
1127 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1128 command->mode = mode;
1131 DEFCOMMAND(15, AlphaTest, int enable;)
1132 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1134 thread->alphatest = command->enable;
1136 void DPSOFTRAST_AlphaTest(int enable)
1138 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1139 command->enable = enable;
1142 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1143 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1145 thread->alphafunc = command->func;
1146 thread->alphavalue = command->ref;
1148 void DPSOFTRAST_AlphaFunc(int func, float ref)
1150 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1151 command->func = func;
1155 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1157 dpsoftrast.color[0] = r;
1158 dpsoftrast.color[1] = g;
1159 dpsoftrast.color[2] = b;
1160 dpsoftrast.color[3] = a;
// Reads a rectangle of dpsoftrast.fb_colorpixels[0] into outpixels.
//  blockx, blocky, blockwidth, blockheight: requested rectangle in pixels
//  outpixels: destination; rows are blockwidth*4 bytes apart (outstride)
// The rectangle is clamped to [0, fb_width) x [0, fb_height) before copying.
// Output row y is taken from framebuffer row (fb_height - 1 - y), so the
// image is flipped vertically relative to framebuffer storage.
1163 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1165 int outstride = blockwidth * 4;
1166 int instride = dpsoftrast.fb_width * 4;
1169 int bx2 = blockx + blockwidth;
1170 int by2 = blocky + blockheight;
1174 unsigned char *inpixels;
// clamp the rectangle to the framebuffer bounds
1178 if (bx1 < 0) bx1 = 0;
1179 if (by1 < 0) by1 = 0;
1180 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1181 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1183 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk the pixels one at a time
// (presumably to reorder the bytes into BGRA -- confirm against the loop body)
1184 if (dpsoftrast.bigendian)
1186 for (y = by1;y < by2;y++)
// b: start of the source row (flipped); o: start of the destination row
// NOTE(review): o is not offset by (bx1 - blockx), so a block with
// blockx < 0 lands at the left edge of outpixels -- confirm this is intended
1188 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189 o = (unsigned char *)outpixels + (y - by1) * outstride;
1190 for (x = bx1;x < bx2;x++)
// other hosts use the same row addressing
1203 for (y = by1;y < by2;y++)
1205 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1206 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from dpsoftrast.fb_colorpixels[0] into
// mip level `mip` of the texture identified by `index`.
//  tx, ty: destination corner in the texture mip
//  sx, sy: source corner in the framebuffer
// Returns without copying when the texture lookup fails or mip is out of
// range. Both rectangles are clamped to their surfaces and the copy size is
// limited to the smaller of the two.
1212 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1216 int tx2 = tx + width;
1217 int ty2 = ty + height;
1220 int sx2 = sx + width;
1221 int sy2 = sy + height;
1231 unsigned int *spixels;
1232 unsigned int *tpixels;
1233 DPSOFTRAST_Texture *texture;
1234 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1235 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the framebuffer
1237 spixels = dpsoftrast.fb_colorpixels[0];
1238 swidth = dpsoftrast.fb_width;
1239 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] as the mip width and [3] as the mip height
1240 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1241 twidth = texture->mipmap[mip][2];
1242 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1243 if (tx1 < 0) tx1 = 0;
1244 if (ty1 < 0) ty1 = 0;
1245 if (tx2 > twidth) tx2 = twidth;
1246 if (ty2 > theight) ty2 = theight;
1247 if (sx1 < 0) sx1 = 0;
1248 if (sy1 < 0) sy1 = 0;
1249 if (sx2 > swidth) sx2 = swidth;
1250 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1255 if (tw > sw) tw = sw;
1256 if (th > sh) th = sh;
1257 if (tw < 1 || th < 1)
// the source is walked from row (sheight - 1 - sy1) towards row 0 while the
// destination rows increase, i.e. the copy is flipped vertically
1259 sy1 = sheight - 1 - sy1;
1260 for (y = 0;y < th;y++)
1261 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// recalculate the mipmaps when the texture has more than one level
1262 if (texture->mipmaps > 1)
1263 DPSOFTRAST_Texture_CalculateMipmaps(index);
1266 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1267 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1269 if (thread->texbound[command->unitnum])
1270 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1271 thread->texbound[command->unitnum] = command->texture;
1273 void DPSOFTRAST_SetTexture(int unitnum, int index)
1275 DPSOFTRAST_Command_SetTexture *command;
1276 DPSOFTRAST_Texture *texture;
1277 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1279 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1282 texture = DPSOFTRAST_Texture_GetByIndex(index);
1283 if (index && !texture)
1285 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1289 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1290 command->unitnum = unitnum;
1291 command->texture = texture;
1293 dpsoftrast.texbound[unitnum] = texture;
1294 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1297 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1299 dpsoftrast.pointer_vertex3f = vertex3f;
1300 dpsoftrast.stride_vertex = stride;
1302 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1304 dpsoftrast.pointer_color4f = color4f;
1305 dpsoftrast.pointer_color4ub = NULL;
1306 dpsoftrast.stride_color = stride;
1308 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1310 dpsoftrast.pointer_color4f = NULL;
1311 dpsoftrast.pointer_color4ub = color4ub;
1312 dpsoftrast.stride_color = stride;
1314 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1316 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1317 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1318 dpsoftrast.stride_texcoord[unitnum] = stride;
1321 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1322 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1324 thread->shader_mode = command->mode;
1325 thread->shader_permutation = command->permutation;
1326 thread->shader_exactspecularmath = command->exactspecularmath;
1328 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1330 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1331 command->mode = mode;
1332 command->permutation = permutation;
1333 command->exactspecularmath = exactspecularmath;
1335 dpsoftrast.shader_mode = mode;
1336 dpsoftrast.shader_permutation = permutation;
1337 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1340 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1341 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1343 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1345 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1347 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348 command->index = index;
1349 command->val[0] = v0;
1350 command->val[1] = v1;
1351 command->val[2] = v2;
1352 command->val[3] = v3;
1354 dpsoftrast.uniform4f[index*4+0] = v0;
1355 dpsoftrast.uniform4f[index*4+1] = v1;
1356 dpsoftrast.uniform4f[index*4+2] = v2;
1357 dpsoftrast.uniform4f[index*4+3] = v3;
1359 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1361 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362 command->index = index;
1363 memcpy(command->val, v, sizeof(command->val));
1365 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1368 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1369 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1371 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets `arraysize` consecutive 4x4 matrix uniforms starting at `uniform`.
// Each matrix reads 16 floats from v and occupies four uniform slots
// (index advances by 4 per matrix, v by 16 floats). One UniformMatrix4f
// command is allocated per matrix and the same rows are also stored in
// dpsoftrast.uniform4f.
1373 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1377 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1379 __m128 m0, m1, m2, m3;
1380 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1381 command->index = (DPSOFTRAST_UNIFORM)index;
// v not aligned to ALIGN_SIZE: use unaligned loads
1382 if (((size_t)v)&(ALIGN_SIZE-1))
1384 m0 = _mm_loadu_ps(v);
1385 m1 = _mm_loadu_ps(v+4);
1386 m2 = _mm_loadu_ps(v+8);
1387 m3 = _mm_loadu_ps(v+12);
// aligned loads (presumably the else branch of the alignment test)
1391 m0 = _mm_load_ps(v);
1392 m1 = _mm_load_ps(v+4);
1393 m2 = _mm_load_ps(v+8);
1394 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack/move shuffles
// (presumably executed only when `transpose` is set -- confirm)
1398 __m128 t0, t1, t2, t3;
1399 t0 = _mm_unpacklo_ps(m0, m1);
1400 t1 = _mm_unpacklo_ps(m2, m3);
1401 t2 = _mm_unpackhi_ps(m0, m1);
1402 t3 = _mm_unpackhi_ps(m2, m3);
1403 m0 = _mm_movelh_ps(t0, t1);
1404 m1 = _mm_movehl_ps(t1, t0);
1405 m2 = _mm_movelh_ps(t2, t3);
1406 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: the rows go into the queued command ...
1408 _mm_store_ps(command->val, m0);
1409 _mm_store_ps(command->val+4, m1);
1410 _mm_store_ps(command->val+8, m2);
1411 _mm_store_ps(command->val+12, m3);
// ... and into the global uniform array
1412 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1413 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1414 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1415 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1420 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1421 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1423 thread->uniform1i[command->index] = command->val;
1425 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1427 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1428 command->index = index;
1431 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from src into dst (end = dst + size*4).
// dst is written with aligned stores, so it must be 16-byte aligned.
// src is read with unaligned loads unless both the src address and the
// stride are multiples of ALIGN_SIZE.
1435 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1437 float *end = dst + size*4;
1438 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1442 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned variant of the same copy
1451 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands 3-float elements from src into 4-float elements in dst, setting
// the fourth component to 1.0f. dst is written with aligned stores.
// Tightly packed input (stride == sizeof(float[3])) has a path that
// converts four elements per step from three 16-byte loads; end4 marks
// the last whole group of four.
1458 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1460 float *end = dst + size*4;
1461 if (stride == sizeof(float[3]))
1463 float *end4 = dst + (size&~3)*4;
1464 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3 (unaligned loads)
1468 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: rotate so lane 0 is free, put 1.0 there, rotate back -> x0 y0 z0 1
1469 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: gather x1 from v1 and y1 z1 from v2 -> x1 y1 z1 1
1472 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: gather x2 y2 from v2 and z2 from v3 -> x2 y2 z2 1
1475 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: v3 already holds x3 y3 z3 in lanes 1..3 -> x3 y3 z3 1
1479 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1482 src += 4*sizeof(float[3]);
// same four-element conversion using aligned loads
1489 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1490 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1491 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1492 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1493 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1494 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1497 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1501 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503 src += 4*sizeof(float[3]);
// one element at a time; aligned loads need both src and stride to be
// multiples of ALIGN_SIZE
// NOTE(review): each load reads 16 bytes for a 12-byte element, so the
// final element reads 4 bytes past its own data -- confirm the source
// arrays are padded
1507 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1511 __m128 v = _mm_loadu_ps((const float *)src);
1512 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515 _mm_store_ps(dst, v);
1524 __m128 v = _mm_load_ps((const float *)src);
1525 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1526 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1527 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1528 _mm_store_ps(dst, v);
// Expands 2-float elements from src into 4-float elements in dst, filling
// the third and fourth components with 0.0f and 1.0f (taken from v2).
// dst is written with aligned stores. Tightly packed input
// (stride == sizeof(float[2])) is converted two elements per 16-byte load;
// end2 marks the last whole pair.
1535 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1537 float *end = dst + size*4;
1538 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1539 if (stride == sizeof(float[2]))
1541 float *end2 = dst + (size&~1)*4;
1542 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = s0 t0 s1 t1: the first store yields s0 t0 0 1, the second s1 t1 0 1
1546 __m128 v = _mm_loadu_ps((const float *)src);
1547 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1550 src += 2*sizeof(float[2]);
// same pair conversion using an aligned load
1557 __m128 v = _mm_load_ps((const float *)src);
1558 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1559 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1561 src += 2*sizeof(float[2]);
// single element: load 8 bytes into the low half of v2 -> s t 0 1
1567 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts 4-byte elements from src into 4-float elements in dst, scaling
// each byte by 1/255. dst is written with aligned stores. Tightly packed
// input (stride == sizeof(unsigned char[4])) is converted four elements
// per 16-byte load; end4 marks the last whole group of four.
1573 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1575 float *end = dst + size*4;
1576 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1577 if (stride == sizeof(unsigned char[4]))
1579 float *end4 = dst + (size&~3)*4;
1580 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend 16 bytes to 16-bit (v1, v2), then to 32-bit, convert to
// float and scale
1584 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1590 src += 4*sizeof(unsigned char[4]);
// same four-element conversion using an aligned load
1597 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1598 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1599 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1600 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1601 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1603 src += 4*sizeof(unsigned char[4]);
// single element: read the four bytes as one int, widen, convert, scale
1609 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1610 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the 4-float value at src into dst; end marks dst + 4*size, so the
// store is presumably repeated for `size` elements (loop not shown here).
// src is read once with an unaligned load; dst is written with aligned
// stores.
1616 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1618 float *end = dst + 4*size;
1619 __m128 v = _mm_loadu_ps(src);
1622 _mm_store_ps(dst, v);
// Transforms numitems 4-float vertices from in4f by the 16-float matrix
// inmatrix16f. A matrix that is bytewise equal to the identity takes a
// fast path that only copies (and copies nothing when out4f == in4f).
// Otherwise each vertex v is combined as
//   v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3
// where m0..m3 are the four consecutive 4-float groups of inmatrix16f.
1628 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1631 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1632 __m128 m0, m1, m2, m3;
1634 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1636 // fast case for identity matrix
1637 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1640 end = out4f + numitems*4;
// the matrix is always read with unaligned loads
1641 m0 = _mm_loadu_ps(inmatrix16f);
1642 m1 = _mm_loadu_ps(inmatrix16f + 4);
1643 m2 = _mm_loadu_ps(inmatrix16f + 8);
1644 m3 = _mm_loadu_ps(inmatrix16f + 12);
1645 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1649 __m128 v = _mm_loadu_ps(in4f);
1651 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1652 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1653 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1654 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input, same arithmetic
1663 __m128 v = _mm_load_ps(in4f);
1665 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1666 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1667 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1668 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vertices from in4f to out4f with memcpy, so the
// two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);

	memcpy(out4f, in4f, bytes);
}
// Perspective-divides a vertex and applies the viewport transform.
// The input (x, y, z, w) is rotated to (1, x, y, z), multiplied by
// viewportscale, divided by w, offset by viewportcenter and rotated back.
// viewportcenter/viewportscale are therefore applied in (w, x, y, z) lane
// order, and the output w lane holds center[0] + scale[0]/w.
1682 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1684 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1685 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1686 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1687 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// The statements shown here are the same as DPSOFTRAST_PROJECTVERTEX:
// rotate (x, y, z, w) to (1, x, y, z), scale, divide by w, add the viewport
// centre and rotate back into `out`.
1690 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1692 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1693 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1694 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1695 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Computes out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, where p is presumably a
// local copy of `in` (its declaration is not among the lines shown here).
1698 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1701 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1704 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Derives a screen-space Y range (*starty, *endy) from the bounding box
// [minpos, maxpos] after transforming its corners by the matrix m0..m3.
// clipmask starts with one bit set per corner; a corner's bit is cleared
// when its z + w is >= 0. Corners that stay flagged contribute through the
// points where their box edges cross z + w == 0.
// NOTE(review): the return statement is not among the lines shown here;
// callers store the result in dpsoftrast.drawclipped, so it is presumably
// clipmask -- confirm.
1707 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1709 int clipmask = 0xFF;
1710 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap lanes 0 and 1 of each matrix row so transformed y lands in lane 0,
// where the scalar (_ss) operations below work
1711 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1712 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1713 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1714 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it
// is >= 0 clear bit k and fold lane0 / w into minproj/maxproj
1715 #define BBFRONT(k, pos) \
1717 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1718 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1719 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1722 clipmask &= ~(1<<k); \
1723 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1724 minproj = _mm_min_ss(minproj, proj); \
1725 maxproj = _mm_max_ss(maxproj, proj); \
// box corners built by mixing components of minpos and maxpos
1729 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1730 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1731 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1732 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1733 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1734 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1738 if (clipmask&(1<<k)) \
1740 if (!(clipmask&(1<<(k^1)))) \
1742 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1743 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1744 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1745 minproj = _mm_min_ss(minproj, proj); \
1746 maxproj = _mm_max_ss(maxproj, proj); \
1748 if (!(clipmask&(1<<(k^2)))) \
1750 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1751 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1752 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1753 minproj = _mm_min_ss(minproj, proj); \
1754 maxproj = _mm_max_ss(maxproj, proj); \
1756 if (!(clipmask&(1<<(k^4)))) \
1758 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1759 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1760 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1761 minproj = _mm_min_ss(minproj, proj); \
1762 maxproj = _mm_max_ss(maxproj, proj); \
// BBCLIP(k): for a corner still flagged in clipmask, visit its three edge
// neighbours (k^1, k^2, k^4); for each unflagged neighbour, interpolate to
// the point where z + w crosses zero and fold its projection into the range
1766 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport vectors into lane 0 for the scalar math below
1767 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1768 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to the viewport
1769 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1770 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1771 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1772 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (truncated, endy + 1)
1773 *starty = _mm_cvttss_si32(maxproj);
1774 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems 4-float vertices without a matrix transform.
// Each input vertex is stored unchanged to out4f and its projected form
// (DPSOFTRAST_PROJECTVERTEX with the framebuffer viewport centre/scale) to
// screen4f, while a componentwise min/max of the inputs is accumulated.
// The bounding box is then passed to DPSOFTRAST_Vertex_BoundY with an
// identity matrix to produce *starty/*endy; its result is returned.
1778 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1780 float *end = out4f + numitems*4;
1781 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1782 __m128 minpos, maxpos;
1783 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1785 minpos = maxpos = _mm_loadu_ps(in4f);
1788 __m128 v = _mm_loadu_ps(in4f);
1789 minpos = _mm_min_ps(minpos, v);
1790 maxpos = _mm_max_ps(maxpos, v);
1791 _mm_store_ps(out4f, v);
1792 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1793 _mm_store_ps(screen4f, v);
// aligned input, same processing
1801 minpos = maxpos = _mm_load_ps(in4f);
1804 __m128 v = _mm_load_ps(in4f);
1805 minpos = _mm_min_ps(minpos, v);
1806 maxpos = _mm_max_ps(maxpos, v);
1807 _mm_store_ps(out4f, v);
1808 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1809 _mm_store_ps(screen4f, v);
1816 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
1817 _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1818 _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1819 _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1820 _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
// Transforms numitems 4-float vertices by inmatrix16f and projects them.
// A matrix bytewise equal to the identity is delegated to
// DPSOFTRAST_Vertex_Project. Otherwise each vertex is transformed
// (DPSOFTRAST_TRANSFORMVERTEX) and stored to out4f, then projected
// (DPSOFTRAST_PROJECTVERTEX) and stored to screen4f. The min/max bounds
// are accumulated from the untransformed inputs and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix; its result is returned.
1824 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1826 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1827 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1829 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1830 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1831 end = out4f + numitems*4;
1832 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1833 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is always read with unaligned loads
1834 m0 = _mm_loadu_ps(inmatrix16f);
1835 m1 = _mm_loadu_ps(inmatrix16f + 4);
1836 m2 = _mm_loadu_ps(inmatrix16f + 8);
1837 m3 = _mm_loadu_ps(inmatrix16f + 12);
1838 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input: seed the bounds with the first vertex
1840 minpos = maxpos = _mm_loadu_ps(in4f);
1843 __m128 v = _mm_loadu_ps(in4f);
1844 minpos = _mm_min_ps(minpos, v);
1845 maxpos = _mm_max_ps(maxpos, v);
1846 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1847 _mm_store_ps(out4f, v);
1848 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1849 _mm_store_ps(screen4f, v);
// aligned input, same processing
1857 minpos = maxpos = _mm_load_ps(in4f);
1860 __m128 v = _mm_load_ps(in4f);
1861 minpos = _mm_min_ps(minpos, v);
1862 maxpos = _mm_max_ps(maxpos, v);
1863 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1864 _mm_store_ps(out4f, v);
1865 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1866 _mm_store_ps(screen4f, v);
1873 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
// Loads the vertex attribute selected by inarray into
// dpsoftrast.post_array4f[outarray] as 4-float elements, covering
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex.
// Strides are byte steps: source addresses are computed on unsigned char
// pointers.
// NOTE(review): the switch header, some case labels and the return are not
// among the lines shown here; callers use the result as the loaded array,
// so it presumably returns outf -- confirm.
1878 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1881 float *outf = dpsoftrast.post_array4f[outarray];
1882 const unsigned char *inb;
1883 int firstvertex = dpsoftrast.firstvertex;
1884 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, expanded to 4 by DPSOFTRAST_Load3fTo4f
1888 case DPSOFTRAST_ARRAY_POSITION:
1889 stride = dpsoftrast.stride_vertex;
1890 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1891 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colours: float array if set, otherwise byte array if set
1893 case DPSOFTRAST_ARRAY_COLOR:
1894 stride = dpsoftrast.stride_color;
1895 if (dpsoftrast.pointer_color4f)
1897 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1898 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1900 else if (dpsoftrast.pointer_color4ub)
1902 stride = dpsoftrast.stride_color;
1903 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1904 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// presumably the fallback when neither colour pointer is set: fill with
// the constant dpsoftrast.color
1908 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0 and the
// loader is chosen from components_texcoord (case labels not shown;
// presumably 2, 3 and 4 components in that order)
1912 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1913 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1915 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1916 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1919 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1922 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1925 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an attribute array in place by inmatrix16f.
// With inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray]
// are used. The return statement is not shown here (presumably `data`).
1937 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1939 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1940 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an attribute array in place with DPSOFTRAST_Vertex_Project.
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy, and the function's result is stored in
// dpsoftrast.drawclipped. With inarray >= 0 the array is first loaded via
// DPSOFTRAST_Array_Load. The return statement is not shown here
// (presumably `data`).
1945 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1948 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1949 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an attribute array in place by inmatrix16f and projects it
// with DPSOFTRAST_Vertex_TransformProject. Screen coordinates go to
// dpsoftrast.screencoord4f, the Y range to dpsoftrast.drawstarty/drawendy,
// and the function's result is stored in dpsoftrast.drawclipped. With
// inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load. The
// return statement is not shown here (presumably `data`).
1957 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1960 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1961 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Steps a reciprocal-w value z across a span from span->startx to
// span->endx. w is a plane equation: triangle->w[2] + x*w[0] + y*w[1],
// evaluated at the span origin. The exact reciprocal 1/w is computed only
// at sub-span boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and
// interpolated linearly in between.
// NOTE(review): the inner loop body is not among the lines shown here;
// presumably it writes z into zf[x] -- confirm.
1968 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1971 int startx = span->startx;
1972 int endx = span->endx;
1973 float wslope = triangle->w[0];
1974 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1975 float endz = 1.0f / (w + wslope * startx);
1976 for (x = startx;x < endx;)
1978 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel
1980 if (nextsub >= endx) nextsub = endsub = endx-1;
1981 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the sub-span is a single pixel
1982 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1983 for (; x <= endsub; x++, z += dz)
1988 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1991 int startx = span->startx;
1992 int endx = span->endx;
1995 unsigned char * RESTRICT pixelmask = span->pixelmask;
1996 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1999 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2000 // handle alphatest now (this affects depth writes too)
2001 if (thread->alphatest)
2002 for (x = startx;x < endx;x++)
2003 if (in4f[x*4+3] < 0.5f)
2004 pixelmask[x] = false;
2005 // FIXME: this does not handle bigendian
2006 switch(thread->fb_blendmode)
2008 case DPSOFTRAST_BLENDMODE_OPAQUE:
2009 for (x = startx;x < endx;x++)
2013 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2014 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2015 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2016 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2017 pixel[x*4+0] = d[0];
2018 pixel[x*4+1] = d[1];
2019 pixel[x*4+2] = d[2];
2020 pixel[x*4+3] = d[3];
2023 case DPSOFTRAST_BLENDMODE_ALPHA:
2024 for (x = startx;x < endx;x++)
2028 a = in4f[x*4+3] * 255.0f;
2029 b = 1.0f - in4f[x*4+3];
2030 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2031 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2032 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2033 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2034 pixel[x*4+0] = d[0];
2035 pixel[x*4+1] = d[1];
2036 pixel[x*4+2] = d[2];
2037 pixel[x*4+3] = d[3];
2040 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2041 for (x = startx;x < endx;x++)
2045 a = in4f[x*4+3] * 255.0f;
2046 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2047 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2048 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2049 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2050 pixel[x*4+0] = d[0];
2051 pixel[x*4+1] = d[1];
2052 pixel[x*4+2] = d[2];
2053 pixel[x*4+3] = d[3];
2056 case DPSOFTRAST_BLENDMODE_ADD:
2057 for (x = startx;x < endx;x++)
2061 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2062 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2063 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2064 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2065 pixel[x*4+0] = d[0];
2066 pixel[x*4+1] = d[1];
2067 pixel[x*4+2] = d[2];
2068 pixel[x*4+3] = d[3];
2071 case DPSOFTRAST_BLENDMODE_INVMOD:
2072 for (x = startx;x < endx;x++)
2076 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2077 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2078 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2079 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2080 pixel[x*4+0] = d[0];
2081 pixel[x*4+1] = d[1];
2082 pixel[x*4+2] = d[2];
2083 pixel[x*4+3] = d[3];
2086 case DPSOFTRAST_BLENDMODE_MUL:
2087 for (x = startx;x < endx;x++)
2091 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2092 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2093 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2094 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2095 pixel[x*4+0] = d[0];
2096 pixel[x*4+1] = d[1];
2097 pixel[x*4+2] = d[2];
2098 pixel[x*4+3] = d[3];
2101 case DPSOFTRAST_BLENDMODE_MUL2:
2102 for (x = startx;x < endx;x++)
2106 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2107 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2108 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2109 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2110 pixel[x*4+0] = d[0];
2111 pixel[x*4+1] = d[1];
2112 pixel[x*4+2] = d[2];
2113 pixel[x*4+3] = d[3];
2116 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2117 for (x = startx;x < endx;x++)
2121 a = in4f[x*4+3] * -255.0f;
2122 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2123 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2124 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2125 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2126 pixel[x*4+0] = d[0];
2127 pixel[x*4+1] = d[1];
2128 pixel[x*4+2] = d[2];
2129 pixel[x*4+3] = d[3];
2132 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2133 for (x = startx;x < endx;x++)
2138 b = 1.0f - in4f[x*4+3];
2139 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2140 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2141 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2142 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2143 pixel[x*4+0] = d[0];
2144 pixel[x*4+1] = d[1];
2145 pixel[x*4+2] = d[2];
2146 pixel[x*4+3] = d[3];
2149 case DPSOFTRAST_BLENDMODE_INVADD:
2150 for (x = startx;x < endx;x++)
2154 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2155 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2156 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2157 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2158 pixel[x*4+0] = d[0];
2159 pixel[x*4+1] = d[1];
2160 pixel[x*4+2] = d[2];
2161 pixel[x*4+3] = d[3];
// Writes one span of 8-bit shader output (in4ub, four bytes per pixel) into the color
// buffer dpsoftrast.fb_colorpixels[0], combining it with the pixels already there
// according to thread->fb_blendmode.  span->pixelmask holds one byte per pixel and is
// consulted before each write; the alpha test below can clear entries in it.
// NOTE(review): this listing omits short lines (braces, break/continue, short
// declarations), so the comments here describe only what the visible lines establish.
2167 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2171 int startx = span->startx;
2172 int endx = span->endx;
2173 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2174 unsigned char * RESTRICT pixelmask = span->pixelmask;
2175 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2176 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views (byte-wise and 32-bit) to pixel (span->x, span->y)
2179 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2180 pixeli += span->y * dpsoftrast.fb_width + span->x;
2181 // handle alphatest now (this affects depth writes too)
// alpha in in4ub is a byte in 0..255, so the half-opacity cutoff is 128; comparing the
// byte against 0.5f would only ever reject alpha == 0
2182 if (thread->alphatest)
2183 for (x = startx;x < endx;x++)
2184 if (in4ub[x*4+3] < 128)
2185 pixelmask[x] = false;
2186 // FIXME: this does not handle bigendian
2187 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, four pixels are copied with a
// single unaligned 128-bit load/store
2189 case DPSOFTRAST_BLENDMODE_OPAQUE:
2190 for (x = startx;x + 4 <= endx;)
2192 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2194 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// ALPHA: dst += (src - dst) * srcalpha / 256 (both factors are shifted left by 4 so
// that _mm_mulhi_epi16's ">> 16" amounts to a division by 256).
// FINISHBLEND(blend2, blend1) is the loop shared by all blending modes below: it walks
// the span two pixels at a time, switching on the pair of mask bytes, widens the src and
// dst bytes to 16-bit lanes, and packs dst back with unsigned saturation when storing;
// a trailing loop handles the last pixel of an odd-length span.
// blend2/blend1 are the blend statements for the two-pixel and one-pixel cases
// (presumably in that order, judging by the 64-bit vs 32-bit loads — confirm).
2208 case DPSOFTRAST_BLENDMODE_ALPHA:
2209 #define FINISHBLEND(blend2, blend1) \
2210 for (x = startx;x + 1 < endx;x += 2) \
2213 switch (*(const unsigned short*)&pixelmask[x]) \
2216 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2217 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2219 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2222 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2223 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2225 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2228 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2229 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2231 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2236 for(;x < endx; x++) \
2239 if (!pixelmask[x]) \
2241 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2242 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2244 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2248 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2249 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2251 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2252 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8; "blend" broadcasts 16-bit lane 3 (the alpha
// byte of each pixel) across that pixel's four lanes
2255 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2257 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2258 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2260 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2261 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst; the clamp to 255 comes from _mm_packus_epi16 in the store
2264 case DPSOFTRAST_BLENDMODE_ADD:
2265 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2267 case DPSOFTRAST_BLENDMODE_INVMOD:
2269 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2271 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2274 case DPSOFTRAST_BLENDMODE_MUL:
2275 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2277 case DPSOFTRAST_BLENDMODE_MUL2:
2278 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8; negative lanes saturate to 0 when packed
2280 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2282 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2283 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2285 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2286 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2289 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2291 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2292 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2294 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2295 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2298 case DPSOFTRAST_BLENDMODE_INVADD:
2300 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2302 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to unit texunitindex across one span and writes the
// result as four floats per pixel into out4f (indexed by x in [startx, endx)).
// Texture coordinates come from attribute arrayindex via DPSOFTRAST_CALCATTRIB4F and are
// multiplied by zf[] at sub-span endpoints (presumably the perspective factor —
// confirm), then stepped linearly in 16.16 fixed point in between.
// Bytes 2,1,0,3 of each texel become output channels 0,1,2,3, scaled to 0..1.
// NOTE(review): this listing omits short lines (braces, returns, short declarations and
// assignments such as the tcimin/tc setup), so comments cover only the visible lines.
2309 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2312 int startx = span->startx;
2313 int endx = span->endx;
2318 float tc[2], endtc[2];
2320 unsigned int tci[2];
2321 unsigned int tci1[2];
2322 unsigned int tcimin[2];
2323 unsigned int tcimax[2];
2328 const unsigned char * RESTRICT pixelbase;
2329 const unsigned char * RESTRICT pixel[4];
2330 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2331 // if no texture is bound, just fill it with white
2334 for (x = startx;x < endx;x++)
2336 out4f[x*4+0] = 1.0f;
2337 out4f[x*4+1] = 1.0f;
2338 out4f[x*4+2] = 1.0f;
2339 out4f[x*4+3] = 1.0f;
// mipmap[mip][0] is used as the byte offset of the selected mip level inside
// texture->bytes; [2] and [3] are used below as its width and height
2343 mip = triangle->mip[texunitindex];
2344 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2345 // if this mipmap of the texture is 1 pixel, just fill it with that color
2346 if (texture->mipmap[mip][1] == 4)
2348 c[0] = texture->bytes[2] * (1.0f/255.0f);
2349 c[1] = texture->bytes[1] * (1.0f/255.0f);
2350 c[2] = texture->bytes[0] * (1.0f/255.0f);
2351 c[3] = texture->bytes[3] * (1.0f/255.0f);
2352 for (x = startx;x < endx;x++)
2354 out4f[x*4+0] = c[0];
2355 out4f[x*4+1] = c[1];
2356 out4f[x*4+2] = c[2];
2357 out4f[x*4+3] = c[3];
2361 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2362 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2363 flags = texture->flags;
2364 tcscale[0] = texture->mipmap[mip][2];
2365 tcscale[1] = texture->mipmap[mip][3];
2366 tciwidth = texture->mipmap[mip][2];
2369 tcimax[0] = texture->mipmap[mip][2]-1;
2370 tcimax[1] = texture->mipmap[mip][3]-1;
2371 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2372 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts from texel corners to
// texel centers
2373 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2374 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
// the span is processed in sub-spans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels: the
// exact coordinate is evaluated at each sub-span end (nextsub) and the pixels in
// between use a constant 16.16 fixed-point step
2375 for (x = startx;x < endx;)
2377 unsigned int subtc[2];
2378 unsigned int substep[2];
2379 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2380 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2381 if (nextsub >= endx)
2383 nextsub = endsub = endx-1;
2384 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2388 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2389 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): these float-to-unsigned conversions receive negative values whenever
// the coordinate or its step is negative; the code relies on that wrapping modulo 2^32
2390 substep[0] = (endtc[0] - tc[0]) * subscale;
2391 substep[1] = (endtc[1] - tc[1]) * subscale;
2392 subtc[0] = tc[0] * (1<<16);
2393 subtc[1] = tc[1] * (1<<16);
// four-tap (bilinear) lookup with coordinates clamped to [tcimin, tcimax].
// subtc is 16.16 fixed point (the texel index is subtc>>16), so the 12-bit filter
// weight must be the TOP 12 bits of the 16-bit fraction: (subtc>>4)&0xFFF.  Masking the
// low 12 bits instead makes the weights cycle 16 times within every texel.
// lerp[] sums to 0x1000000, hence the 1/0xFF000000 scale for 8-bit texels.
2396 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2398 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2400 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2401 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2402 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2403 tci[0] = subtc[0]>>16;
2404 tci[1] = subtc[1]>>16;
2405 tci1[0] = tci[0] + 1;
2406 tci1[1] = tci[1] + 1;
2407 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2408 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2409 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2410 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2411 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2412 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2413 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2414 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2415 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2416 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2417 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2418 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2419 out4f[x*4+0] = c[0];
2420 out4f[x*4+1] = c[1];
2421 out4f[x*4+2] = c[2];
2422 out4f[x*4+3] = c[3];
// four-tap (bilinear) lookup with coordinates wrapped by AND with size-1 (this only
// wraps correctly for power-of-two sizes); the weight is again the top 12 bits of the
// 16-bit fraction of subtc
2427 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2429 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2430 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2431 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2432 tci[0] = subtc[0]>>16;
2433 tci[1] = subtc[1]>>16;
2434 tci1[0] = tci[0] + 1;
2435 tci1[1] = tci[1] + 1;
2436 tci[0] &= tciwrapmask[0];
2437 tci[1] &= tciwrapmask[1];
2438 tci1[0] &= tciwrapmask[0];
2439 tci1[1] &= tciwrapmask[1];
2440 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2441 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2442 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2443 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2444 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2445 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2446 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2447 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2448 out4f[x*4+0] = c[0];
2449 out4f[x*4+1] = c[1];
2450 out4f[x*4+2] = c[2];
2451 out4f[x*4+3] = c[3];
// single-tap lookup, coordinates clamped to [tcimin, tcimax]
2455 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2457 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2459 tci[0] = subtc[0]>>16;
2460 tci[1] = subtc[1]>>16;
2461 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2462 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2463 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2464 c[0] = pixel[0][2] * (1.0f / 255.0f);
2465 c[1] = pixel[0][1] * (1.0f / 255.0f);
2466 c[2] = pixel[0][0] * (1.0f / 255.0f);
2467 c[3] = pixel[0][3] * (1.0f / 255.0f);
2468 out4f[x*4+0] = c[0];
2469 out4f[x*4+1] = c[1];
2470 out4f[x*4+2] = c[2];
2471 out4f[x*4+3] = c[3];
// single-tap lookup, coordinates wrapped by AND with size-1
2476 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2478 tci[0] = subtc[0]>>16;
2479 tci[1] = subtc[1]>>16;
2480 tci[0] &= tciwrapmask[0];
2481 tci[1] &= tciwrapmask[1];
2482 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2483 c[0] = pixel[0][2] * (1.0f / 255.0f);
2484 c[1] = pixel[0][1] * (1.0f / 255.0f);
2485 c[2] = pixel[0][0] * (1.0f / 255.0f);
2486 c[3] = pixel[0][3] * (1.0f / 255.0f);
2487 out4f[x*4+0] = c[0];
2488 out4f[x*4+1] = c[1];
2489 out4f[x*4+2] = c[2];
2490 out4f[x*4+3] = c[3];
2496 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2500 int startx = span->startx;
2501 int endx = span->endx;
2503 __m128 data, slope, tcscale;
2504 __m128i tcsize, tcmask, tcoffset, tcmax;
2506 __m128i subtc, substep, endsubtc;
2509 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2510 const unsigned char * RESTRICT pixelbase;
2511 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2512 // if no texture is bound, just fill it with white
2515 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2518 mip = triangle->mip[texunitindex];
2519 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2520 // if this mipmap of the texture is 1 pixel, just fill it with that color
2521 if (texture->mipmap[mip][1] == 4)
2523 unsigned int k = *((const unsigned int *)pixelbase);
2524 for (x = startx;x < endx;x++)
2528 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2529 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2530 flags = texture->flags;
2531 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2532 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2533 tcscale = _mm_cvtepi32_ps(tcsize);
2534 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2535 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2536 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2537 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2538 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2539 tcmax = _mm_packs_epi32(tcmask, tcmask);
2540 for (x = startx;x < endx;)
2542 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2543 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2544 if (nextsub >= endx)
2546 nextsub = endsub = endx-1;
2547 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2551 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2552 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2553 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2554 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2555 substep = _mm_slli_epi32(substep, 1);
2558 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2559 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2561 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2562 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2564 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2565 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2566 tci = _mm_madd_epi16(tci, tcoffset);
2567 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2568 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2569 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2570 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2571 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2572 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2573 fracm = _mm_srli_epi16(subtc, 1);
2574 pix1 = _mm_add_epi16(pix1,
2575 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2576 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2577 pix3 = _mm_add_epi16(pix3,
2578 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2579 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2580 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2581 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2582 pix2 = _mm_add_epi16(pix2,
2583 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2584 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2585 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2589 const unsigned char * RESTRICT ptr1;
2590 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2591 tci = _mm_madd_epi16(tci, tcoffset);
2592 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2593 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2594 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2595 fracm = _mm_srli_epi16(subtc, 1);
2596 pix1 = _mm_add_epi16(pix1,
2597 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2598 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2599 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2600 pix1 = _mm_add_epi16(pix1,
2601 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2602 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2603 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2607 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2609 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2611 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2612 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2613 tci = _mm_madd_epi16(tci, tcoffset);
2614 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2615 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2616 _mm_setzero_si128());
2617 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2618 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2619 _mm_setzero_si128());
2620 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2621 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2622 tci = _mm_madd_epi16(tci, tcoffset);
2623 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2624 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2625 _mm_setzero_si128());
2626 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2627 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2628 _mm_setzero_si128());
2629 fracm = _mm_srli_epi16(subtc, 1);
2630 pix1 = _mm_add_epi16(pix1,
2631 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633 pix3 = _mm_add_epi16(pix3,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638 pix2 = _mm_add_epi16(pix2,
2639 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2645 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2646 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2647 tci = _mm_madd_epi16(tci, tcoffset);
2648 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2649 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2650 _mm_setzero_si128());
2651 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2652 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2653 _mm_setzero_si128());
2654 fracm = _mm_srli_epi16(subtc, 1);
2655 pix1 = _mm_add_epi16(pix1,
2656 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2659 pix1 = _mm_add_epi16(pix1,
2660 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2661 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2662 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2668 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2670 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2671 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2672 tci = _mm_madd_epi16(tci, tcoffset);
2673 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2674 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2675 _mm_setzero_si128());
2676 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2677 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2678 _mm_setzero_si128());
2679 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2680 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2681 tci = _mm_madd_epi16(tci, tcoffset);
2682 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684 _mm_setzero_si128());
2685 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687 _mm_setzero_si128());
2688 fracm = _mm_srli_epi16(subtc, 1);
2689 pix1 = _mm_add_epi16(pix1,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2692 pix3 = _mm_add_epi16(pix3,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2694 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2695 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2696 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2697 pix2 = _mm_add_epi16(pix2,
2698 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2699 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2700 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2704 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2705 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2706 tci = _mm_madd_epi16(tci, tcoffset);
2707 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2708 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2709 _mm_setzero_si128());
2710 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2711 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2712 _mm_setzero_si128());
2713 fracm = _mm_srli_epi16(subtc, 1);
2714 pix1 = _mm_add_epi16(pix1,
2715 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2717 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2718 pix1 = _mm_add_epi16(pix1,
2719 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2720 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2721 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2728 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2730 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2732 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2733 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2734 tci = _mm_madd_epi16(tci, tcoffset);
2735 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2736 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2740 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2741 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2742 tci = _mm_madd_epi16(tci, tcoffset);
2743 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2749 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2751 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2752 tci = _mm_and_si128(tci, tcmax);
2753 tci = _mm_madd_epi16(tci, tcoffset);
2754 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2755 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2759 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2760 tci = _mm_and_si128(tci, tcmax);
2761 tci = _mm_madd_epi16(tci, tcoffset);
2762 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the BGRA8 texture span sampler.  The only visible statement
// fills the span's pixels [startx, endx) of out4ub with byte value 255; no texture is
// read and triangle, texunitindex, arrayindex and zf are not used by it.
// The byte count must be (endx - startx)*4: the reversed subtraction is negative for
// every non-empty span and turns into an enormous size_t when handed to memset.
2771 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2774 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a pointer to a float vector and returns a float.
// NOTE(review): the body is not shown in this listing, so what it samples and what the
// return value means cannot be stated from here — confirm against the full source.
2777 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For every pixel x in [span->startx, span->endx), multiplies the four input floats
// in4f[x*4 .. x*4+3] component-wise by the interpolated attribute arrayindex and stores
// the products in out4f.  The attribute is evaluated as (data + slope*x) * z, with data
// and slope supplied by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the assignment of z is on a line not shown in this listing (presumably
// taken from zf[x]) — confirm.
2783 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2786 int startx = span->startx;
2787 int endx = span->endx;
2792 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2793 for (x = startx;x < endx;x++)
// c = attribute value at this pixel
2796 c[0] = (data[0] + slope[0]*x) * z;
2797 c[1] = (data[1] + slope[1]*x) * z;
2798 c[2] = (data[2] + slope[2]*x) * z;
2799 c[3] = (data[3] + slope[3]*x) * z;
2800 out4f[x*4+0] = in4f[x*4+0] * c[0];
2801 out4f[x*4+1] = in4f[x*4+1] * c[1];
2802 out4f[x*4+2] = in4f[x*4+2] * c[2];
2803 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For every pixel x in [span->startx, span->endx), writes the interpolated value of
// attribute arrayindex into out4f[x*4 .. x*4+3].  The value is (data + slope*x) * z,
// with data and slope supplied by DPSOFTRAST_CALCATTRIB4F.
// NOTE(review): the assignment of z is on a line not shown in this listing (presumably
// taken from zf[x]) — confirm.
2807 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2810 int startx = span->startx;
2811 int endx = span->endx;
2816 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2817 for (x = startx;x < endx;x++)
2820 c[0] = (data[0] + slope[0]*x) * z;
2821 c[1] = (data[1] + slope[1]*x) * z;
2822 c[2] = (data[2] + slope[2]*x) * z;
2823 c[3] = (data[3] + slope[3]*x) * z;
2824 out4f[x*4+0] = c[0];
2825 out4f[x*4+1] = c[1];
2826 out4f[x*4+2] = c[2];
2827 out4f[x*4+3] = c[3];
// For every pixel x in [span->startx, span->endx) and each of the four components:
//   out4f = ina4f + max(inb4f - subcolor, 0)
// i.e. subcolor is subtracted from the second buffer, negative results are clamped to
// zero, and the remainder is added to the first buffer.  triangle is not used by the
// visible lines.
2831 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2833 int x, startx = span->startx, endx = span->endx;
2834 float c[4], localcolor[4];
// take a local copy of subcolor before the loop
2835 localcolor[0] = subcolor[0];
2836 localcolor[1] = subcolor[1];
2837 localcolor[2] = subcolor[2];
2838 localcolor[3] = subcolor[3];
2839 for (x = startx;x < endx;x++)
2841 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2842 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2843 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2844 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2845 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2846 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2847 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2848 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// For every pixel x in [span->startx, span->endx), stores the component-wise product
// of the two four-float input buffers: out4f = ina4f * inb4f.  triangle is not used by
// the visible lines.
2852 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2854 int x, startx = span->startx, endx = span->endx;
2855 for (x = startx;x < endx;x++)
2857 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2858 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2859 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2860 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2864 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2866 int x, startx = span->startx, endx = span->endx;
2867 for (x = startx;x < endx;x++)
2869 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2870 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2871 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2872 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2876 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2878 int x, startx = span->startx, endx = span->endx;
2880 for (x = startx;x < endx;x++)
2882 a = 1.0f - inb4f[x*4+3];
2884 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2885 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2886 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2887 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2891 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2893 int x, startx = span->startx, endx = span->endx;
2894 float localcolor[4], ilerp, lerp;
2895 localcolor[0] = color[0];
2896 localcolor[1] = color[1];
2897 localcolor[2] = color[2];
2898 localcolor[3] = color[3];
2899 ilerp = 1.0f - localcolor[3];
2900 lerp = localcolor[3];
2901 for (x = startx;x < endx;x++)
2903 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2904 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2905 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2906 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2912 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2916 int startx = span->startx;
2917 int endx = span->endx;
2920 __m128i submod, substep, endsubmod;
2921 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2922 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2923 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2924 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2925 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2926 for (x = startx; x < endx;)
2928 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2929 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2930 if (nextsub >= endx)
2932 nextsub = endsub = endx-1;
2933 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2937 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2938 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2939 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2940 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2941 substep = _mm_packs_epi32(substep, substep);
2942 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2944 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2945 pix = _mm_mulhi_epu16(pix, submod);
2946 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2950 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2951 pix = _mm_mulhi_epu16(pix, submod);
2952 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2959 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2963 int startx = span->startx;
2964 int endx = span->endx;
2967 __m128i submod, substep, endsubmod;
2968 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2969 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2970 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2971 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2972 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2973 for (x = startx; x < endx;)
2975 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2976 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2977 if (nextsub >= endx)
2979 nextsub = endsub = endx-1;
2980 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2984 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2985 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2986 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2987 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2988 substep = _mm_packs_epi32(substep, substep);
2989 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2991 __m128i pix = _mm_srai_epi16(submod, 4);
2992 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2996 __m128i pix = _mm_srai_epi16(submod, 4);
2997 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3004 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3007 int x, startx = span->startx, endx = span->endx;
3008 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3009 localcolor = _mm_packs_epi32(localcolor, localcolor);
3010 for (x = startx;x+2 <= endx;x+=2)
3012 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3013 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3014 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3015 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3019 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3020 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3021 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3022 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3027 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3030 int x, startx = span->startx, endx = span->endx;
3031 for (x = startx;x+2 <= endx;x+=2)
3033 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3034 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3035 pix1 = _mm_mulhi_epu16(pix1, pix2);
3036 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3040 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3041 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3042 pix1 = _mm_mulhi_epu16(pix1, pix2);
3043 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3048 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3051 int x, startx = span->startx, endx = span->endx;
3052 for (x = startx;x+2 <= endx;x+=2)
3054 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3055 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3056 pix1 = _mm_add_epi16(pix1, pix2);
3057 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3061 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3062 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3063 pix1 = _mm_add_epi16(pix1, pix2);
3064 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3069 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3072 int x, startx = span->startx, endx = span->endx;
3073 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3074 tint = _mm_packs_epi32(tint, tint);
3075 for (x = startx;x+2 <= endx;x+=2)
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3079 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3080 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3086 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3087 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3092 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3095 int x, startx = span->startx, endx = span->endx;
3096 for (x = startx;x+2 <= endx;x+=2)
3098 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3099 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3100 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3101 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3102 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3106 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3107 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3108 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3109 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3110 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3115 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3118 int x, startx = span->startx, endx = span->endx;
3119 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3120 localcolor = _mm_packs_epi32(localcolor, localcolor);
3121 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3122 for (x = startx;x+2 <= endx;x+=2)
3124 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3125 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3126 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3130 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3131 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3132 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3139 void DPSOFTRAST_VertexShader_Generic(void)
3141 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3142 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3143 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3144 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3145 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3148 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3150 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3151 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3152 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3153 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3154 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3155 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3157 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3158 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3159 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3161 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3162 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3165 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3167 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3170 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3172 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3175 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3180 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3181 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3186 void DPSOFTRAST_VertexShader_PostProcess(void)
3188 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3189 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3190 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3193 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3195 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3196 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3197 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3198 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3199 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3200 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3201 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3203 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3204 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3206 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3207 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3209 // TODO: implement saturation
3211 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3213 // TODO: implement gammaramps
3215 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3220 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3222 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3225 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3227 // this is never called (because colormask is off when this shader is used)
3228 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3229 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3232 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3237 void DPSOFTRAST_VertexShader_FlatColor(void)
3239 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3240 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3243 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3246 unsigned char * RESTRICT pixelmask = span->pixelmask;
3247 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3248 int x, startx = span->startx, endx = span->endx;
3249 __m128i Color_Ambientm;
3250 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3251 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3252 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3253 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3254 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3255 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3256 pixel = buffer_FragColorbgra8;
3257 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3258 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3259 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3260 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3261 for (x = startx;x < endx;x++)
3264 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3267 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3268 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3269 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3270 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3276 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3277 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3278 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3280 if (pixel == buffer_FragColorbgra8)
3281 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3287 void DPSOFTRAST_VertexShader_VertexColor(void)
3289 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3290 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3291 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3294 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3297 unsigned char * RESTRICT pixelmask = span->pixelmask;
3298 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3299 int x, startx = span->startx, endx = span->endx;
3300 __m128i Color_Ambientm, Color_Diffusem;
3302 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3303 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3305 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3306 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3307 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3308 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3309 pixel = buffer_FragColorbgra8;
3310 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3313 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3314 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3315 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3316 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3317 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3318 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3319 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3320 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3321 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3322 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3323 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3325 __m128i color, mod, pix;
3326 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3329 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3330 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3331 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3332 data = _mm_add_ps(data, slope);
3333 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3334 data = _mm_add_ps(data, slope);
3335 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3336 data = _mm_add_ps(data, slope);
3337 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3338 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3339 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3340 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3341 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3342 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3348 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3349 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3350 mod = _mm_packs_epi32(mod, mod);
3351 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3352 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3354 if (pixel == buffer_FragColorbgra8)
3355 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3361 void DPSOFTRAST_VertexShader_Lightmap(void)
3363 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3365 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3368 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3371 unsigned char * RESTRICT pixelmask = span->pixelmask;
3372 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3373 int x, startx = span->startx, endx = span->endx;
3374 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3375 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3376 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3377 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3381 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3382 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3383 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3384 pixel = buffer_FragColorbgra8;
3385 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3388 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3389 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3390 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3391 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3392 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3394 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3395 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3396 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3397 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3398 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3399 for (x = startx;x < endx;x++)
3401 __m128i color, lightmap, glow, pix;
3402 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3405 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3406 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3407 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3408 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3409 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3410 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3411 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3412 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3413 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3414 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3420 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3421 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3422 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3423 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3424 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3425 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3430 for (x = startx;x < endx;x++)
3432 __m128i color, lightmap, pix;
3433 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3436 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3437 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3438 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3439 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3440 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3441 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3442 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3448 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3449 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3450 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3451 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3454 if (pixel == buffer_FragColorbgra8)
3455 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3460 void DPSOFTRAST_VertexShader_LightDirection(void);
3461 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader: delegates entirely to the
// LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3468 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3470 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the model-space light-direction-map mode.
// Runs the LightDirection vertex shader, then additionally loads TEXCOORD4,
// which the pixel shader uses as the coordinate set for the lightmap and deluxemap lookups.
3475 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3477 DPSOFTRAST_VertexShader_LightDirection();
3478 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the model-space light-direction-map mode.
// Forwards to the LightDirection pixel shader, which selects the model-space path
// by testing thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3481 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3483 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader entry point for the tangent-space light-direction-map mode.
// Runs the LightDirection vertex shader, then additionally loads TEXCOORD4,
// which the pixel shader uses as the coordinate set for the lightmap and deluxemap lookups.
3488 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3490 DPSOFTRAST_VertexShader_LightDirection();
3491 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader entry point for the tangent-space light-direction-map mode.
// Forwards to the LightDirection pixel shader, which selects the tangent-space path
// by testing thread->shader_mode against SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3494 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3496 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader shared by the light-direction based modes.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye vector in the
// basis formed by the per-vertex svector/tvector/normal (TEXCOORD1/2/3) and stores the
// results in TEXCOORD5 (light vector) and TEXCOORD6 (eye vector); finally the positions
// are run through the model-view-projection matrix.
3501 void DPSOFTRAST_VertexShader_LightDirection(void)
3504 int numvertices = dpsoftrast.numvertices;
3506 float LightVector[4];
3507 float EyePosition[4];
3508 float EyeVectorModelSpace[4];
// fetch the light direction and eye position uniforms (four floats each)
3514 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3515 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3516 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3517 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3518 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3519 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3520 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3521 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the vertex attributes into the post arrays; TEXCOORD0 goes through the texture matrix,
// the others are loaded as-is
3522 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3523 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3524 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3525 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3526 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3527 for (i = 0;i < numvertices;i++)
// read this vertex's position and its three basis vectors (only xyz are used)
3529 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3530 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3531 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3532 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3533 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3534 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3535 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3536 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3537 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3538 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3539 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3540 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir)), stored in TEXCOORD5 with w = 0
3541 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3542 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3543 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3544 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3545 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3546 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3547 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// eye vector: vertex-to-eye offset projected onto the same three basis vectors, stored in TEXCOORD6 with w = 0
3548 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3549 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3550 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3551 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3552 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3553 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3554 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3555 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3556 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3557 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// project the positions last, after the loop above has consumed the unprojected values
// NOTE(review): the -1 source argument presumably means "use the already loaded post array" - confirm in DPSOFTRAST_Array_TransformProject
3559 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3562 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3563 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3564 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3565 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3566 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3567 #define DPSOFTRAST_Vector3Normalize(v)\
3570 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the light-direction, fake-light and light-direction-map modes.
// Shades the pixels x in [span->startx, span->endx) of one span into buffer_FragColorbgra8
// and hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three lighting paths are selected by thread->shader_permutation:
//   SPECULAR - ambient + diffuse + specular (normal map and gloss map are sampled)
//   DIFFUSE  - ambient + diffuse (normal map is sampled)
//   otherwise - ambient only
// Within the first two paths thread->shader_mode decides where the light direction and
// light color come from (deluxemap/lightmap, eye vector, or interpolated light vector).
// The output channels are only clamped at the top (255).
3581 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, four bytes per pixel for the texture samples and the result
3583 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3584 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3586 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3587 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3590 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3591 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593 int x, startx = span->startx, endx = span->endx;
3594 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolants come in pairs: "data" is the value at x = 0 and "slope" the change per pixel,
// as used by the (data + slope * x) expressions below
3595 float LightVectordata[4];
3596 float LightVectorslope[4];
3597 float EyeVectordata[4];
3598 float EyeVectorslope[4];
3599 float VectorSdata[4];
3600 float VectorSslope[4];
3601 float VectorTdata[4];
3602 float VectorTslope[4];
3603 float VectorRdata[4];
3604 float VectorRslope[4];
3606 float diffusetex[4];
3608 float surfacenormal[4];
3609 float lightnormal[4];
3610 float lightnormal_modelspace[4];
3612 float specularnormal[4];
3615 float SpecularPower;
// color uniforms are stored with components 0 and 2 swapped
// NOTE(review): presumably the uniforms are RGB while the byte buffers are BGRA - confirm
3617 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3618 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3619 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3620 Color_Glow[3] = 0.0f;
3621 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3622 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3623 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component carries the Alpha uniform; it is the only factor applied to the output alpha
3624 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3625 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3626 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3627 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3628 Color_Pants[3] = 0.0f;
3629 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3630 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3631 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3632 Color_Shirt[3] = 0.0f;
// start the span (fills buffer_z) and sample the base color texture; pants/shirt and glow
// textures are only sampled when their permutation bits are set
3633 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3634 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3635 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3637 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3638 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3640 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3642 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- path 1: ambient + diffuse + specular ----
3644 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3646 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3647 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3648 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3649 Color_Diffuse[3] = 0.0f;
3650 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3651 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3652 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3653 LightColor[3] = 0.0f;
3654 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3655 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3656 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3657 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3658 Color_Specular[3] = 0.0f;
// the power is pre-divided by 255 because it is later multiplied by the 0..255 gloss alpha byte
3659 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// the eye vector (TEXCOORD6) is needed by every mode in this path for the specular term
3660 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3661 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs: model-space deluxemaps also need the S/T/R basis interpolants
3663 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3665 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3666 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3667 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3668 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3669 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3671 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3673 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3674 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3676 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3678 // nothing of this needed
// NOTE(review): presumably the final else of the mode chain - plain light direction uses the interpolated TEXCOORD5 light vector
3682 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the specular path
// NOTE(review): z, used below to scale the interpolated vectors, is presumably the per-pixel value from buffer_z - confirm
3685 for (x = startx;x < endx;x++)
// base color as floats in the 0..255 range
3688 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3689 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3690 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3691 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// add the tinted pants and shirt layers (only this path applies color mapping)
3692 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3694 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3695 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3696 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3697 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3699 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3700 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3701 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3702 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: byte/128 - 1, reading the bytes in reverse order (byte 2 becomes component 0)
3703 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3706 DPSOFTRAST_Vector3Normalize(surfacenormal);
// choose the light direction (lightnormal) and light color for this pixel according to the shader mode
3708 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3710 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3711 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3712 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3713 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3715 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3716 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3717 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3718 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3720 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3721 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3722 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3723 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3725 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3726 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3727 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3728 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3730 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3731 DPSOFTRAST_Vector3Normalize(lightnormal);
3733 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes scaled by 1/256 and divided by lightnormal[2], which is floored at 0.25 to bound the boost
3735 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3736 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3737 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3738 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// tangent-space deluxemap: the decoded texel is used directly (it is not normalized here); light color is the lightmap scaled by 1/256
3741 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3743 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3744 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3745 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3747 float f = 1.0f / 256.0f;
3748 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3749 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3750 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// fake light: light comes from the viewer, so the normalized eye vector is the light direction and the light color is white
3753 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3755 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3756 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3757 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3758 DPSOFTRAST_Vector3Normalize(lightnormal);
3760 LightColor[0] = 1.0;
3761 LightColor[1] = 1.0;
3762 LightColor[2] = 1.0;
// NOTE(review): presumably the final else of the mode chain - normalized interpolated light vector, LightColor stays at the uniform value
3766 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3767 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3768 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3769 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: dot(surfacenormal, lightnormal), negative values replaced by zero
3772 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term: either the reflection-vector form (exact) or the half-vector form
3774 if(thread->shader_exactspecularmath)
3776 // reflect lightnormal at surfacenormal, take the negative of that
3777 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3779 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3780 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3781 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3782 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3784 // dot of this and normalize(EyeVectorFogDepth.xyz)
3785 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3786 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3787 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3788 DPSOFTRAST_Vector3Normalize(eyenormal);
3790 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// NOTE(review): presumably the else branch - half vector = normalize(lightnormal + eyenormal), dotted with the surface normal
3794 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3795 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3796 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3797 DPSOFTRAST_Vector3Normalize(eyenormal);
3799 specularnormal[0] = lightnormal[0] + eyenormal[0];
3800 specularnormal[1] = lightnormal[1] + eyenormal[1];
3801 specularnormal[2] = lightnormal[2] + eyenormal[2];
3802 DPSOFTRAST_Vector3Normalize(specularnormal);
3804 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) times the gloss alpha byte
3807 specular = pow(specular, SpecularPower * glosstex[3]);
// combine: [glow +] ambient + (diffuse + specular) * light color per channel; alpha = base alpha * Color_Ambient[3]
// each channel is clamped at 255 only (there is no lower clamp)
3808 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3810 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3811 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3812 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3813 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// same combination without the glow term
3817 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3818 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3819 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3820 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3823 buffer_FragColorbgra8[x*4+0] = d[0];
3824 buffer_FragColorbgra8[x*4+1] = d[1];
3825 buffer_FragColorbgra8[x*4+2] = d[2];
3826 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: ambient + diffuse (no gloss map, no specular term, no pants/shirt blending) ----
3829 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3831 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3832 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3833 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3834 Color_Diffuse[3] = 0.0f;
3835 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3836 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3837 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3838 LightColor[3] = 0.0f;
3839 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs, as in the specular path; here the eye vector is only needed for fake light
3841 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3843 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3844 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3845 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3846 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3847 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3849 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3851 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3852 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3854 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3856 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// NOTE(review): presumably the final else of the mode chain - interpolated TEXCOORD5 light vector
3860 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel loop of the diffuse path
3863 for (x = startx;x < endx;x++)
3866 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3867 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3868 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3869 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode and normalize the normal map texel (bytes read in reverse order)
3870 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3873 DPSOFTRAST_Vector3Normalize(surfacenormal);
// light direction and color selection; same per-mode logic as in the specular path
3875 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3877 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3878 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3879 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3880 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3882 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3883 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3884 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3885 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3887 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3888 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3889 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3890 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3892 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3893 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3894 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3895 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3897 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3898 DPSOFTRAST_Vector3Normalize(lightnormal);
3900 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes scaled by 1/256 and divided by lightnormal[2], which is floored at 0.25 to bound the boost
3902 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3903 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3904 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3905 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3908 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3910 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3911 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3912 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3914 float f = 1.0f / 256.0f;
3915 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3916 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3917 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3920 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3922 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3923 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3924 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3925 DPSOFTRAST_Vector3Normalize(lightnormal);
3927 LightColor[0] = 1.0;
3928 LightColor[1] = 1.0;
3929 LightColor[2] = 1.0;
// NOTE(review): presumably the final else of the mode chain - normalized interpolated light vector
3933 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3934 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3935 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3936 DPSOFTRAST_Vector3Normalize(lightnormal);
// combine: [glow +] base color * (ambient + diffuse color * diffuse * light color); alpha = base alpha * Color_Ambient[3]
3939 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3940 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3942 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3943 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3944 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3945 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3949 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3950 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3951 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3952 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3954 buffer_FragColorbgra8[x*4+0] = d[0];
3955 buffer_FragColorbgra8[x*4+1] = d[1];
3956 buffer_FragColorbgra8[x*4+2] = d[2];
3957 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only ----
// NOTE(review): presumably the final else of the permutation chain (neither SPECULAR nor DIFFUSE set)
3962 for (x = startx;x < endx;x++)
3965 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3966 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3967 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3968 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// combine: [glow +] base color * ambient, clamped at 255
3970 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3972 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3973 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3974 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3975 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3979 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3980 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3981 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3982 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3984 buffer_FragColorbgra8[x*4+0] = d[0];
3985 buffer_FragColorbgra8[x*4+1] = d[1];
3986 buffer_FragColorbgra8[x*4+2] = d[2];
3987 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finishing routine
3990 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the light-source mode (a light with a position).
// For every vertex it projects the vertex-to-light and vertex-to-eye offsets onto the
// per-vertex svector/tvector/normal basis (read from TEXCOORD1/2/3) and writes the results
// back into TEXCOORD1 (light vector) and TEXCOORD2 (eye vector). Afterwards the positions
// are projected and TEXCOORD3 is filled with the position transformed by the ModelToLight matrix.
3995 void DPSOFTRAST_VertexShader_LightSource(void)
3998 int numvertices = dpsoftrast.numvertices;
3999 float LightPosition[4];
4000 float LightVector[4];
4001 float LightVectorModelSpace[4];
4002 float EyePosition[4];
4003 float EyeVectorModelSpace[4];
// fetch the light position and eye position uniforms (four floats each)
4009 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4010 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4011 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4012 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4013 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4014 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4015 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4016 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// bring the vertex attributes into the post arrays; TEXCOORD0 goes through the texture matrix,
// the others are loaded as-is
4017 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4018 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4019 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4020 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4021 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4022 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4023 for (i = 0;i < numvertices;i++)
// copy position and basis vectors into locals first: TEXCOORD1 and TEXCOORD2 are overwritten further down
4025 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4026 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4027 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4028 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4029 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4030 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4031 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4032 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4033 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4034 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4035 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4036 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// light vector: vertex-to-light offset projected onto svector/tvector/normal, stored in TEXCOORD1 with w = 0
4037 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4038 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4039 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4040 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4041 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4042 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4043 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4044 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4045 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4046 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// eye vector: vertex-to-eye offset projected onto the same basis, stored in TEXCOORD2 with w = 0
4047 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4048 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4049 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4050 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4051 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4052 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4053 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4054 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4055 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4056 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// project the positions after the loop has consumed the unprojected values
// NOTE(review): the -1 source argument presumably means "use the already loaded post array" - confirm in DPSOFTRAST_Array_TransformProject
4058 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (the normal, no longer needed) is replaced by the POSITION array transformed with ModelToLight
// NOTE(review): presumably this reads the original input positions rather than the projected ones - confirm in DPSOFTRAST_Array_Transform
4059 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span (pixel) shader for the light-source shader mode.
// For every x in [span->startx, span->endx) it produces one BGRA8 pixel in
// buffer_FragColorbgra8 and finally hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread->shader_permutation selects one of three lighting variants:
//   SHADERPERMUTATION_SPECULAR : ambient + diffuse + specular
//   SHADERPERMUTATION_DIFFUSE  : ambient + diffuse
//   neither                    : ambient only
// NOTE(review): several names used below (z, attenuation, d, glosstex, eyenormal, f,
// diffuse, specular) have no declaration visible in this view, and the statement that
// follows each `attenuation < 0.01f` test is not visible either.
4062 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers, indexed by x (the BGRA8 buffers use 4 bytes per pixel).
4065 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4066 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4067 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4069 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4070 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4071 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4072 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4073 int x, startx = span->startx, endx = span->endx;
4074 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is kept as a (data, slope) pair and evaluated per pixel
// below as (data[i] + slope[i]*x) * z.
4075 float CubeVectordata[4];
4076 float CubeVectorslope[4];
4077 float LightVectordata[4];
4078 float LightVectorslope[4];
4079 float EyeVectordata[4];
4080 float EyeVectorslope[4];
4082 float diffusetex[4];
4084 float surfacenormal[4];
4085 float lightnormal[4];
4087 float specularnormal[4];
4090 float SpecularPower;
4091 float CubeVector[4];
// Uniform colors are loaded with components 0 and 2 swapped (uniform component 0 is
// stored in element 2), presumably so element order matches the BGRA byte buffers.
// Color_Glow is loaded here but not read anywhere in the visible lines of this function.
4094 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4095 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4096 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4097 Color_Glow[3] = 0.0f;
4098 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4099 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4100 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The fourth ambient component is taken from the separate Alpha uniform.
4101 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4102 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4103 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4104 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4105 Color_Diffuse[3] = 0.0f;
4106 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4107 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4108 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4109 Color_Specular[3] = 0.0f;
4110 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4111 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4112 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4113 Color_Pants[3] = 0.0f;
4114 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4115 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4116 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4117 Color_Shirt[3] = 0.0f;
4118 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4119 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4120 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4121 LightColor[3] = 0.0f;
// Pre-scaled by 1/255 because the exponent below is SpecularPower * glosstex[3],
// where glosstex[3] is a raw byte read from the gloss texture buffer.
4122 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// TEXCOORD1 -> light vector, TEXCOORD2 -> eye vector, TEXCOORD3 -> cube/attenuation vector.
4123 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4124 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4125 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4126 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4127 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Texture fetches shared by all three variants: base color, optional pants/shirt
// color-mapping layers, and the optional cube filter.
4128 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4129 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4131 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4134 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4135 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Variant 1: ambient + diffuse + specular ----
4136 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4138 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4139 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4140 for (x = startx;x < endx;x++)
// Distance attenuation: 1 - |CubeVector|^2, optionally multiplied by the shadowmap sample.
4143 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4144 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4145 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4146 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4147 if (attenuation < 0.01f)
4149 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4151 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4152 if (attenuation < 0.01f)
// Base color as floats in byte units, plus the tinted pants/shirt layers when color mapping is on.
4156 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4157 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4158 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4159 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4160 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4162 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4163 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4164 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4165 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4167 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4168 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4169 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4170 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte/128 - 1 per component, reading bytes 2,1,0 into
// elements 0,1,2 (reverses the BGR storage order), then normalize.
4171 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4172 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4173 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4174 DPSOFTRAST_Vector3Normalize(surfacenormal);
4176 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4177 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4178 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4179 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: dot(N, L), clamped below at zero.
4181 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, exact path: dot(eye, reflected light vector).
4183 if(thread->shader_exactspecularmath)
4185 // reflect lightnormal at surfacenormal, take the negative of that
4186 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4188 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4189 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4190 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4191 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4193 // dot of this and normalize(EyeVectorFogDepth.xyz)
4194 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4195 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4196 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4197 DPSOFTRAST_Vector3Normalize(eyenormal);
4199 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Specular term, other path: dot(N, normalize(L + E)), i.e. the normalized sum of the
// light and eye directions is used in place of the reflection vector.
4203 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4204 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4205 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4206 DPSOFTRAST_Vector3Normalize(eyenormal);
4208 specularnormal[0] = lightnormal[0] + eyenormal[0];
4209 specularnormal[1] = lightnormal[1] + eyenormal[1];
4210 specularnormal[2] = lightnormal[2] + eyenormal[2];
4211 DPSOFTRAST_Vector3Normalize(specularnormal);
4213 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4215 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine. Only the upper bound (255) is clamped; alpha is the (color-mapped) diffuse alpha.
4217 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4219 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4220 attenuation *= (1.0f / 255.0f);
4221 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4222 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4223 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4224 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4228 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4229 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4230 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4231 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4233 buffer_FragColorbgra8[x*4+0] = d[0];
4234 buffer_FragColorbgra8[x*4+1] = d[1];
4235 buffer_FragColorbgra8[x*4+2] = d[2];
4236 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 2: ambient + diffuse (same as variant 1 without gloss/specular) ----
4239 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4241 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4242 for (x = startx;x < endx;x++)
4245 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4246 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4247 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4248 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4249 if (attenuation < 0.01f)
4251 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4253 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4254 if (attenuation < 0.01f)
4258 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4259 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4260 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4261 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4262 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4264 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4265 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4266 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4267 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4269 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4270 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4271 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4272 DPSOFTRAST_Vector3Normalize(surfacenormal);
4274 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4275 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4276 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4277 DPSOFTRAST_Vector3Normalize(lightnormal);
4279 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4280 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4282 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4283 attenuation *= (1.0f / 255.0f);
4284 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4285 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4286 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4287 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4291 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4292 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4293 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4294 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4296 buffer_FragColorbgra8[x*4+0] = d[0];
4297 buffer_FragColorbgra8[x*4+1] = d[1];
4298 buffer_FragColorbgra8[x*4+2] = d[2];
4299 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Variant 3: ambient only (no normal map, no N.L term) ----
4304 for (x = startx;x < endx;x++)
4307 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4308 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4309 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4310 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4311 if (attenuation < 0.01f)
4313 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4315 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4316 if (attenuation < 0.01f)
4320 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4321 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4322 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4323 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4324 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4326 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4327 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4328 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4329 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4331 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4333 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4334 attenuation *= (1.0f / 255.0f);
4335 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4336 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4337 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4338 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4342 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4343 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4344 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4345 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4347 buffer_FragColorbgra8[x*4+0] = d[0];
4348 buffer_FragColorbgra8[x*4+1] = d[1];
4349 buffer_FragColorbgra8[x*4+2] = d[2];
4350 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colors to the common span-finish routine.
4353 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode.
// Issues three array transforms using uniform matrices from dpsoftrast.uniform4f:
//   1. TEXCOORD1 <- POSITION  with ModelViewProjectionMatrixM1 (non-projecting transform)
//   2. TEXCOORD0 <- TEXCOORD0 with TexMatrixM1
//   3. POSITION  <- POSITION  with ModelViewProjectionMatrixM1 (transform + project)
// NOTE(review): the (destination, source, matrix) argument order is assumed from usage;
// confirm against DPSOFTRAST_Array_Transform. Under that assumption step 1 must stay
// before step 3, because step 3 overwrites POSITION.
4359 void DPSOFTRAST_VertexShader_Refraction(void)
4361 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4362 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4363 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the refraction shader mode.
// For every x in [span->startx, span->endx): computes a screen-space texture
// coordinate from the interpolated TEXCOORD1 attribute, offsets it by the xy of the
// normalized normal-map vector scaled by DistortScaleRefractReflect, samples mip level 0
// of the texture bound to GL20TU_REFRACTION (bilinear if the texture has
// DPSOFTRAST_TEXTURE_FILTER_LINEAR, otherwise a single texel), multiplies by
// RefractColor and writes BGRA8 into buffer_FragColorbgra8.
// Returns without drawing anything when no refraction texture is bound.
// NOTE(review): v, c and iw are used below but their declarations are not visible in this view.
4366 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4368 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4370 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4372 int x, startx = span->startx, endx = span->endx;
4375 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4376 //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4377 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4380 float ModelViewProjectionPositiondata[4];
4381 float ModelViewProjectionPositionslope[4];
4384 float ScreenScaleRefractReflect[2];
4385 float ScreenCenterRefractReflect[2];
4386 float DistortScaleRefractReflect[2];
4387 float RefractColor[4];
4389 const unsigned char * RESTRICT pixelbase;
4390 const unsigned char * RESTRICT pixel[4];
4391 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4392 if(!texture) return;
// Start of mip level 0 inside the texture's byte storage.
4393 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4396 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4397 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4398 //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4401 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4404 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4405 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4406 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4407 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4408 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4409 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is loaded with components 0 and 2 swapped so it multiplies the
// BGR-ordered sample c[] directly.
4410 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4411 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4412 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4413 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4416 for (x = startx;x < endx;x++)
4418 float SafeScreenTexCoord[2];
4419 float ScreenTexCoord[2];
4426 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4427 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4429 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4430 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4431 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4433 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4434 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4435 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4436 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4437 DPSOFTRAST_Vector3Normalize(v);
4438 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4439 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4441 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4442 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Bilinear path. tc is the texel coordinate in 16.16 fixed point, shifted back by half
// a texel (32768 == 0.5 in 16.16) on BOTH axes so samples are centered on texels.
// NOTE(review): converting a negative float to unsigned int is undefined in C, so
// coordinates left of / above the texture may not clamp to texel 0 as the code below
// intends; confirm whether tc should be a signed type.
4444 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32768};
// Blend factors are 12-bit (0..0xFFF): two multiplied weights give 24 bits, which the
// >>24 below removes. They must therefore be the TOP 12 bits of the 16-bit fraction.
4445 unsigned int frac[2] = { (tc[0]>>4)&0xFFF, (tc[1]>>4)&0xFFF };
4446 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4447 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4448 int tci[2] = { tc[0]>>16, tc[1]>>16 };
4449 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// Clamp both texel rows/columns to [0, width-1] x [0, height-1] (clamp-to-edge).
4450 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4451 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4452 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4453 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// Four neighbouring texels, 4 bytes each, row stride = mip width (mipmap[0][2]).
4454 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4455 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4456 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4457 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4458 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4459 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4460 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Non-linear path: only pixel[0] (the clamped texel at tci) is addressed in the visible lines.
4464 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4465 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4466 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4469 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4470 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4476 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// Tint the sample; alpha comes from RefractColor[3] alone, scaled to bytes and capped at 255.
4477 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4478 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4479 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4480 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4483 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode: a single transform+project call on the
// POSITION array (both array arguments are POSITION) with the
// ModelViewProjectionMatrixM1 uniform. No other arrays are touched.
4488 void DPSOFTRAST_VertexShader_Water(void)
4490 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the water shader mode. No shading is computed: the span's
// bytes in the color buffer are zeroed (all four BGRA channels = 0) and the span is
// finished through the common DPSOFTRAST_Draw_Span_FinishBGRA8 path.
4494 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4497 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4498 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4499 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only the [startx, endx) part of the buffer is initialized (4 bytes per pixel).
4500 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4501 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode: a single transform+project call on the
// POSITION array (both array arguments are POSITION) with the
// ModelViewProjectionMatrixM1 uniform.
4506 void DPSOFTRAST_VertexShader_ShowDepth(void)
4508 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the show-depth shader mode. Despite the name, the visible code does
// not turn depth into a color: it zeroes the span's BGRA bytes and finishes the span.
4511 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4514 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4515 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4516 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only the [startx, endx) part of the buffer is initialized (4 bytes per pixel).
4517 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4518 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode: a single transform+project call
// on the POSITION array (both array arguments are POSITION) with the
// ModelViewProjectionMatrixM1 uniform.
4523 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4525 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the deferred-geometry shader mode. No geometry data is written by the
// visible code: it zeroes the span's BGRA bytes and finishes the span.
4528 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4531 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4532 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4533 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only the [startx, endx) part of the buffer is initialized (4 bytes per pixel).
4534 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4535 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode: a single transform+project
// call on the POSITION array (both array arguments are POSITION) with the
// ModelViewProjectionMatrixM1 uniform.
4540 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4542 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the deferred-light-source shader mode. No lighting is computed by the
// visible code: it zeroes the span's BGRA bytes and finishes the span.
4545 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4548 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4549 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4550 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Only the [startx, endx) part of the buffer is initialized (4 bytes per pixel).
4551 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4552 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex-stage and span-stage entry points plus the
// vertex arrays and texture units it uses.
// NOTE(review): every initializer of this struct starts with an integer (2) before
// the Vertex pointer, so there is a leading member that is not visible in this view.
4557 typedef struct DPSOFTRAST_ShaderModeInfo_s
// Vertex stage; takes no arguments (the visible implementations use the global dpsoftrast state).
4560 void (*Vertex)(void);
// Span (pixel) stage, called once per span with the owning thread and triangle.
4561 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* ids; initializers end the list with ~0.
4562 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* ids; initializers end the list with ~0.
4563 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4565 DPSOFTRAST_ShaderModeInfo;
// One entry per shader mode (SHADERMODE_COUNT entries); the span dispatcher indexes this
// table with thread->shader_mode, so row order must match the shader-mode enumeration
// (NOTE(review): the enumeration is not visible here; confirm the order).
// Row layout: { leading integer (2 in every row), vertex function, span function,
//               { array ids..., ~0 }, { texture unit ids..., ~0 } }.
// NOTE(review): the last four rows (Water, ShowDepth, DeferredGeometry,
// DeferredLightSource) give no texunits initializer, so per C aggregate-initialization
// rules their texunits are all zero rather than starting with the ~0 terminator;
// confirm that consumers of texunits tolerate this.
4567 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4569 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4570 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4571 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4572 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4573 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4574 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4575 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4576 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4577 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4578 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4579 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4580 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4581 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4582 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4583 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4584 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Processes every span queued on `thread` (thread->spans[0 .. numspans)) and
// then empties the queue by setting thread->numspans to 0.
// With depth testing enabled and a depth buffer present, a per-pixel mask is
// built from the depth comparison, the shader-mode Span callback is invoked
// when a color buffer and color mask are present, and the depth-write loop
// runs when thread->depthmask is set. Without depth testing the mask is
// filled with 1 and only the Span callback runs.
// NOTE(review): the embedded original line numbers skip values, so some
// statements of this function (braces, loop bodies) are not visible here.
4587 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4594 // unsigned int *colorpixel;
4595 unsigned int *depthpixel;
4601 DPSOFTRAST_State_Triangle *triangle;
4602 DPSOFTRAST_State_Span *span;
4603 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4604 for (i = 0; i < thread->numspans; i++)
4606 span = &thread->spans[i];
4607 triangle = &thread->triangles[span->triangle];
4608 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// w at the span origin comes from the triangle's plane coefficients:
// w[0] is the per-x slope, w[1] the per-y slope, w[2] the constant term
4610 wslope = triangle->w[0];
4611 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4612 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
// polygon offset biases the starting depth by a constant term
// (polygonoffset[1]) plus a term scaled by |wslope| (polygonoffset[0])
4613 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4614 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4615 startx = span->startx;
// fill pixelmask[startx .. endx) according to the depth function;
// d carries the span's interpolated depth at column x
4617 switch(thread->fb_depthfunc)
4620 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4621 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4622 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4623 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4624 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4625 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4626 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4628 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4629 //for (x = startx;x < endx;x++)
4630 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4631 // if there is no color buffer, skip pixel shader
// the two loops below test for masked-out pixels at the start and at the end
// of the span; presumably they trim them (loop bodies are not visible here)
4632 while (startx < endx && !pixelmask[startx])
4634 while (endx > startx && !pixelmask[endx-1])
4637 continue; // no pixels to fill
4638 span->pixelmask = pixelmask;
4639 span->startx = startx;
4641 // run pixel shader if appropriate
4642 // do this before running depthmask code, to allow the pixelshader
4643 // to clear pixelmask values for alpha testing
4644 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4645 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4646 if (thread->depthmask)
4647 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4653 // no depth testing means we're just dealing with color...
4654 // if there is no color buffer, skip pixel shader
4655 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel of the span passes when no depth test is performed
4657 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4658 span->pixelmask = pixelmask;
4659 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4663 thread->numspans = 0;
// Fields of the Draw command: byte size of the vertex data, the row range
// [starty, endy), a reference count (set to the thread count by
// DPSOFTRAST_DrawTriangles and decremented by each thread in
// DPSOFTRAST_Interpret_Draw), the clipped flag, the vertex/triangle counts,
// and pointers to the vertex arrays and the int or short element list.
4666 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of `thread`: for each triangle it
// clips against the near plane, computes the screen bounding box, sets up
// the w and attribute interpolation planes and per-texture-unit mip levels,
// and queues spans for the rows that fall inside this thread's bands.
// Spans are flushed through DPSOFTRAST_Draw_ProcessSpans whenever the span
// or triangle queue fills up, and once more at the end. Each thread drops
// one reference on command->refcount; the thread that reaches zero frees
// command->arrays when the command's size shows the data was not stored
// inline after the command.
// NOTE(review): the embedded original line numbers skip values, so braces,
// some declarations and some branch statements are not visible here.
4668 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4671 int cullface = thread->cullface;
4672 int minx, maxx, miny, maxy;
4673 int miny1, maxy1, miny2, maxy2;
4674 __m128i fbmin, fbmax;
4675 __m128 viewportcenter, viewportscale;
4676 int firstvertex = command->firstvertex;
4677 int numvertices = command->numvertices;
4678 int numtriangles = command->numtriangles;
4679 const int *element3i = command->element3i;
4680 const unsigned short *element3s = command->element3s;
4681 int clipped = command->clipped;
4688 int starty, endy, bandy;
4692 __m128 triangleedge1, triangleedge2, trianglenormal;
4695 DPSOFTRAST_State_Triangle *triangle;
4696 DPSOFTRAST_Texture *texture;
4697 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands (miny1..maxy1 and miny2..maxy2) to the
// vertical extent of the scissor rectangle
4698 miny = thread->fb_scissor[1];
4699 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4700 miny1 = bound(miny, thread->miny1, maxy);
4701 maxy1 = bound(miny, thread->maxy1, maxy);
4702 miny2 = bound(miny, thread->miny2, maxy);
4703 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range overlaps neither band: nothing to draw here, so
// only this thread's reference is released
4704 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4706 if (!ATOMIC_DECREMENT(command->refcount))
4708 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4709 MM_FREE(command->arrays);
4713 minx = thread->fb_scissor[0];
4714 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clamp rectangle replicated as 16-bit (x, y) pairs: from (minx, miny1) to
// (maxx-1, maxy2-1), i.e. spanning both bands
4715 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4716 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4717 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4718 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4719 screen[3] = _mm_setzero_ps();
4720 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
// per-triangle setup and span generation
4721 for (i = 0;i < numtriangles;i++)
4723 const float *screencoord4f = command->arrays;
4724 const float *arrays = screencoord4f + numvertices*4;
4726 // generate the 3 edges of this triangle
4727 // generate spans for the triangle - switch based on left split or right split classification of triangle
4730 e[0] = element3s[i*3+0] - firstvertex;
4731 e[1] = element3s[i*3+1] - firstvertex;
4732 e[2] = element3s[i*3+2] - firstvertex;
4736 e[0] = element3i[i*3+0] - firstvertex;
4737 e[1] = element3i[i*3+1] - firstvertex;
4738 e[2] = element3i[i*3+2] - firstvertex;
4747 #define SKIPBACKFACE \
4748 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4749 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4750 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4751 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4752 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4756 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4760 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4765 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4766 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4768 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4769 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4771 #define CLIPPEDVERTEXCOPY(k,p1) \
4772 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4774 #define GENATTRIBCOPY(attrib, p1) \
4775 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4776 #define GENATTRIBLERP(attrib, p1, p2) \
4778 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4779 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4781 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4785 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4786 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4787 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4788 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4789 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4790 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4791 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4797 // calculate distance from nearplane
4798 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4799 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4800 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify the three vertices by the sign of their near-plane distance;
// vertices in front are copied, crossed edges are interpolated
4801 if (clipdist[0] >= 0.0f)
4803 if (clipdist[1] >= 0.0f)
4805 if (clipdist[2] >= 0.0f)
4808 // triangle is entirely in front of nearplane
4809 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4816 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4824 if (clipdist[2] >= 0.0f)
4826 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4833 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4840 else if (clipdist[1] >= 0.0f)
4842 if (clipdist[2] >= 0.0f)
4844 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4851 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4857 else if (clipdist[2] >= 0.0f)
4859 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4864 else continue; // triangle is entirely behind nearplane
4867 // calculate integer y coords for triangle points
4868 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4869 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4870 screenmin = _mm_min_epi16(screeni, screenir),
4871 screenmax = _mm_max_epi16(screeni, screenir);
4872 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4873 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4874 screenmin = _mm_max_epi16(screenmin, fbmin);
4875 screenmax = _mm_min_epi16(screenmax, fbmax);
4876 // skip offscreen triangles
4877 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4879 starty = _mm_extract_epi16(screenmin, 1);
4880 endy = _mm_extract_epi16(screenmax, 1)+1;
// the clamped row range lies entirely between the end of the first band and
// the start of the second; presumably the triangle is skipped (the statement
// under this test is not visible here)
4881 if (starty >= maxy1 && endy <= miny2)
4883 screeny = _mm_srai_epi32(screeni, 16);
4886 triangle = &thread->triangles[thread->numtriangles];
4888 // calculate attribute plans for triangle data...
4889 // okay, this triangle is going to produce spans, we'd better project
4890 // the interpolants now (this is what gives perspective texturing),
4891 // this consists of simply multiplying all arrays by the W coord
4892 // (which is basically 1/Z), which will be undone per-pixel
4893 // (multiplying by Z again) to get the perspective-correct array
4896 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4897 __m128 mipedgescale, mipdensity;
4898 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4899 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4900 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4901 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4902 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4903 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4904 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4905 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4906 attribedge1 = _mm_sub_ss(w0, w1);
4907 attribedge2 = _mm_sub_ss(w2, w1);
4908 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4909 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4910 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4911 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4912 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w holds the plane for w: x slope, y slope, value at the origin
// (consumed by DPSOFTRAST_Draw_ProcessSpans)
4913 _mm_store_ss(&triangle->w[0], attribxslope);
4914 _mm_store_ss(&triangle->w[1], attribyslope);
4915 _mm_store_ss(&triangle->w[2], attriborigin);
4916 mipedgescale = _mm_setzero_ps();
// for each vertex array listed for the current shader mode, store the x
// slope, y slope and origin of (attribute * w) in triangle->attribs[k]
4917 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4919 __m128 attrib0, attrib1, attrib2;
4920 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4921 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4923 arrays += numvertices*4;
4924 GENATTRIBS(attrib0, attrib1, attrib2);
4925 attriborigin = _mm_mul_ps(attrib1, w1);
4926 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4927 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4928 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4929 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4930 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4931 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4932 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4933 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4934 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4936 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4937 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4938 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4939 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// choose a mip level per texture unit used by the shader mode; units whose
// texture is unbound or uses a filter not above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR keep the 0 written by the memset
4943 memset(triangle->mip, 0, sizeof(triangle->mip));
4944 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4946 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4947 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4949 texture = thread->texbound[texunit];
4950 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4952 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4953 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4954 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4955 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4956 // this will be multiplied in the texturing routine by the texture resolution
4957 y = _mm_cvtss_si32(mipdensity);
4960 y = (int)(log((float)y)*0.5f/M_LN2);
4961 if (y > texture->mipmaps - 1)
4962 y = texture->mipmaps - 1;
4963 triangle->mip[texunit] = y;
// walk the triangle's rows band by band: bandy is the end of the first band
// on the first pass, then the end of the second band, with y advanced to at
// least miny2 between passes
4969 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4972 __m128 xcoords, xslope;
4973 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4974 int yccmask = _mm_movemask_epi8(ycc);
4975 int edge0p, edge0n, edge1p, edge1n;
4982 case 0xFFFF: /*0000*/ y = endy; continue;
4983 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4984 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4985 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4986 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4987 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4988 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4989 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4990 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4991 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4992 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4993 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4994 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4995 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4996 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4997 case 0x0000: /*1111*/ y++; continue;
5005 case 0xFFFF: /*000*/ y = endy; continue;
5006 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5007 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5008 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5009 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5010 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5011 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5012 case 0x0000: /*111*/ y++; continue;
5015 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5016 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5017 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5018 nexty = _mm_extract_epi16(ycc, 0);
5019 if (nexty >= bandy) nexty = bandy-1;
5020 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5021 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5022 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5023 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5024 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half: swap the two edges when the first
// edge's x is greater than the second's
5025 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5027 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5028 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// emit spans for rows y..nexty, stepping both edge x coordinates by xslope
5030 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5032 int startx, endx, offset;
5033 startx = _mm_cvtss_si32(xcoords);
5034 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5037 if (startx < 0) startx = 0;
5038 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5040 if (endx > maxx) endx = maxx;
5041 if (startx >= endx) continue;
// split the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5042 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5044 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5045 span->triangle = thread->numtriangles;
5048 span->startx = max(minx - offset, 0);
5049 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5050 if (span->startx >= span->endx)
// span queue is full: process it now
5052 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5053 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue is full: process pending spans and restart at slot 0
5058 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5060 DPSOFTRAST_Draw_ProcessSpans(thread);
5061 thread->numtriangles = 0;
// release this thread's reference; the thread that reaches zero frees the
// vertex data when the command's size shows it was not stored inline
5065 if (!ATOMIC_DECREMENT(command->refcount))
5067 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5068 MM_FREE(command->arrays);
// process whatever is still queued before returning
5071 if (thread->numspans > 0 || thread->numtriangles > 0)
5073 DPSOFTRAST_Draw_ProcessSpans(thread);
5074 thread->numtriangles = 0;
// Allocates a Draw command plus storage for its vertex data and element list.
// The data area holds two float[4] arrays per vertex (screen coordinates and
// position), one more per vertex array listed for the current shader mode,
// and a copy of the element list. If command + data would exceed
// DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is allocated separately with
// MM_CALLOC; otherwise it is placed directly after the command.
// Also points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] into
// the data area and records firstvertex/numvertices in dpsoftrast.
// NOTE(review): the return statement and the conditions selecting between
// element3s and element3i are not visible in this listing.
5079 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5083 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5084 int datasize = 2*numvertices*sizeof(float[4]);
5085 DPSOFTRAST_Command_Draw *command;
5086 unsigned char *data;
// add one float[4]-per-vertex array for each entry of the shader mode's array list
5087 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5089 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5090 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5092 datasize += numvertices*sizeof(float[4]);
5095 datasize += numtriangles*sizeof(unsigned short[3]);
5097 datasize += numtriangles*sizeof(int[3]);
5098 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to store inline: the command carries only its header and the
// data comes from a separate zeroed allocation
5099 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5101 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5102 data = (unsigned char *)MM_CALLOC(datasize, 1);
5106 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5107 data = (unsigned char *)command + commandsize;
5109 command->firstvertex = firstvertex;
5110 command->numvertices = numvertices;
5111 command->numtriangles = numtriangles;
5112 command->arrays = (float *)data;
// carve the data area: screen coords, then position, then one array per
// shader-mode entry, in the same order the size was accumulated above
5113 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5114 dpsoftrast.firstvertex = firstvertex;
5115 dpsoftrast.numvertices = numvertices;
5116 dpsoftrast.screencoord4f = (float *)data;
5117 data += numvertices*sizeof(float[4]);
5118 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5119 data += numvertices*sizeof(float[4]);
5120 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5122 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5123 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5125 dpsoftrast.post_array4f[j] = (float *)data;
5126 data += numvertices*sizeof(float[4]);
// the element list is copied to the end of the data area
5128 command->element3i = NULL;
5129 command->element3s = NULL;
5132 command->element3s = (unsigned short *)data;
5133 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5137 command->element3i = (int *)data;
5138 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a triangle draw: allocates the Draw command, runs the current
// shader mode's Vertex callback, and clamps the draw's row range to the
// framebuffer height. An empty row range frees any separately allocated
// vertex data and undoes the command. Otherwise the command records the
// clipped flag and a reference count equal to the number of threads.
// NOTE(review): braces and branch keywords are not visible in this listing;
// the final DPSOFTRAST_Draw_FlushThreads call is presumably the non-threaded
// path — confirm.
5143 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5145 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5146 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5147 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5148 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible: release the data (if it was allocated separately) and
// take the command back out of the pool
5149 if (command->starty >= command->endy)
5151 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5152 MM_FREE(command->arrays);
5153 DPSOFTRAST_UndoCommand(command->commandsize);
5156 command->clipped = dpsoftrast.drawclipped;
5157 command->refcount = dpsoftrast.numthreads;
5159 if (dpsoftrast.usethreads)
5162 DPSOFTRAST_Draw_SyncCommands();
// wake only starving threads whose row bands overlap the draw's row range
5163 for (i = 0; i < dpsoftrast.numthreads; i++)
5165 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5166 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5167 Thread_CondSignal(thread->drawcond);
5172 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command: carries the new framebuffer width and height.
5176 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: flags the thread's framebuffer-dependent state for
// revalidation.
5177 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5179 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Switches the framebuffer size and the depth/color target pointers.
// The condition below detects any difference from the current dpsoftrast
// state; the update then stores the new values, recalculates the viewport
// center/scale and queues a SetRenderTargets command with the new size.
// NOTE(review): the lines between the condition and the assignments are not
// visible in this listing.
5181 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5183 DPSOFTRAST_Command_SetRenderTargets *command;
5184 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5185 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5186 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5188 dpsoftrast.fb_width = width;
5189 dpsoftrast.fb_height = height;
5190 dpsoftrast.fb_depthpixels = depthpixels;
5191 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5192 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5193 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5194 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5195 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5196 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5197 command->width = width;
5198 command->height = height;
// Executes the commands in dpsoftrast.commandpool from thread->commandoffset
// up to (not including) endoffset on behalf of `thread`, then stores the
// final offset back into thread->commandoffset. The offset wraps to 0 once
// it reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
5201 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5203 int commandoffset = thread->commandoffset;
5204 while (commandoffset != endoffset)
5206 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5207 switch (command->opcode)
// fixed-size commands: dispatch to DPSOFTRAST_Interpret_<name> and advance
// by the aligned size of the command struct
5209 #define INTERPCOMMAND(name) \
5210 case DPSOFTRAST_OPCODE_##name : \
5211 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5212 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5213 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5214 commandoffset = 0; \
5216 INTERPCOMMAND(Viewport)
5217 INTERPCOMMAND(ClearColor)
5218 INTERPCOMMAND(ClearDepth)
5219 INTERPCOMMAND(ColorMask)
5220 INTERPCOMMAND(DepthTest)
5221 INTERPCOMMAND(ScissorTest)
5222 INTERPCOMMAND(Scissor)
5223 INTERPCOMMAND(BlendFunc)
5224 INTERPCOMMAND(BlendSubtract)
5225 INTERPCOMMAND(DepthMask)
5226 INTERPCOMMAND(DepthFunc)
5227 INTERPCOMMAND(DepthRange)
5228 INTERPCOMMAND(PolygonOffset)
5229 INTERPCOMMAND(CullFace)
5230 INTERPCOMMAND(AlphaTest)
5231 INTERPCOMMAND(AlphaFunc)
5232 INTERPCOMMAND(SetTexture)
5233 INTERPCOMMAND(SetShader)
5234 INTERPCOMMAND(Uniform4f)
5235 INTERPCOMMAND(UniformMatrix4f)
5236 INTERPCOMMAND(Uniform1i)
5237 INTERPCOMMAND(SetRenderTargets)
// Draw commands are variable-sized, so they advance by the size recorded in
// the command itself; the thread's offset is also written back right after
// each draw rather than only at the end
5239 case DPSOFTRAST_OPCODE_Draw:
5240 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5241 commandoffset += command->commandsize;
5242 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5244 thread->commandoffset = commandoffset;
5247 case DPSOFTRAST_OPCODE_Reset:
5252 thread->commandoffset = commandoffset;
// Worker thread entry point; `data` is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index >= 0: interprets pending commands up to
// dpsoftrast.drawcommand when there are any. The idle path takes drawmutex,
// re-checks that nothing is pending, signals waitcond if a waiter is
// registered, marks itself starving and sleeps on drawcond.
// NOTE(review): braces, the branch keyword between the two paths and the
// return statement are not visible in this listing.
5255 static int DPSOFTRAST_Draw_Thread(void *data)
5257 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5258 while(thread->index >= 0)
5260 if (thread->commandoffset != dpsoftrast.drawcommand)
5262 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5266 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so new work or a shutdown request is noticed
// before going to sleep
5267 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5269 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5270 thread->starving = true;
5271 Thread_CondWait(thread->drawcond, thread->drawmutex);
5272 thread->starving = false;
5274 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the
// command pool as empty (usedcommands = 0).
// Threaded path: first signals each starving thread that still has pending
// commands, then waits on each thread's waitcond until it has caught up.
// The last loop interprets pending commands directly on the calling thread.
// NOTE(review): the branch keyword before the last loop is not visible in
// this listing; it is presumably the non-threaded path — confirm.
5280 static void DPSOFTRAST_Draw_FlushThreads(void)
5282 DPSOFTRAST_State_Thread *thread;
5284 DPSOFTRAST_Draw_SyncCommands();
5285 if (dpsoftrast.usethreads)
// pass 1: wake sleeping threads that have work outstanding
5287 for (i = 0; i < dpsoftrast.numthreads; i++)
5289 thread = &dpsoftrast.threads[i];
5290 if (thread->commandoffset != dpsoftrast.drawcommand)
5292 Thread_LockMutex(thread->drawmutex);
5293 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5294 Thread_CondSignal(thread->drawcond);
5295 Thread_UnlockMutex(thread->drawmutex);
// pass 2: block until each thread reports completion via waitcond
5298 for (i = 0; i < dpsoftrast.numthreads; i++)
5300 thread = &dpsoftrast.threads[i];
5301 if (thread->commandoffset != dpsoftrast.drawcommand)
5303 Thread_LockMutex(thread->drawmutex);
5304 if (thread->commandoffset != dpsoftrast.drawcommand)
5306 thread->waiting = true;
5307 Thread_CondWait(thread->waitcond, thread->drawmutex);
5308 thread->waiting = false;
5310 Thread_UnlockMutex(thread->drawmutex);
// run outstanding commands directly on the caller
5316 for (i = 0; i < dpsoftrast.numthreads; i++)
5318 thread = &dpsoftrast.threads[i];
5319 if (thread->commandoffset != dpsoftrast.drawcommand)
5320 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5323 dpsoftrast.commandpool.usedcommands = 0;
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
5326 void DPSOFTRAST_Flush(void)
5328 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point.
// NOTE(review): only the signature is visible in this listing; presumably it
// flushes like DPSOFTRAST_Flush — confirm against the full file.
5331 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state `dpsoftrast` and its threads.
//   width, height  - framebuffer size; also used as the initial viewport and
//                    per-thread scissor size
//   numthreads     - requested thread count; threading is used only when
//                    this is > 0 and Thread_HasThreads() reports support,
//                    in which case the count is clamped to 1..64, otherwise
//                    a single thread state is used
//   interlace      - clamped to 0..1 when threading is used, else forced to 0
//   colorpixels    - becomes color target 0; targets 1..3 start as NULL
//   depthpixels    - becomes the depth target
// Each thread state receives default GL-style state (cull GL_BACK, blend
// ONE/ZERO, depth test and depth write on with GL_LEQUAL, alpha test off) and
// is validated; with threading, its condition variables, mutex and worker
// thread are created.
// NOTE(review): declarations, braces and the return statement are not
// visible in this listing. colormask[0] is not assigned in the visible lines
// — confirm it is set elsewhere.
5336 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5346 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5347 dpsoftrast.bigendian = u.b[3];
5348 dpsoftrast.fb_width = width;
5349 dpsoftrast.fb_height = height;
5350 dpsoftrast.fb_depthpixels = depthpixels;
5351 dpsoftrast.fb_colorpixels[0] = colorpixels;
// clear the three additional color targets (the original repeated index 1
// three times, leaving the intent for [2] and [3] unexpressed)
5352 dpsoftrast.fb_colorpixels[1] = NULL;
5353 dpsoftrast.fb_colorpixels[2] = NULL;
5354 dpsoftrast.fb_colorpixels[3] = NULL;
5355 dpsoftrast.viewport[0] = 0;
5356 dpsoftrast.viewport[1] = 0;
5357 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5358 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5359 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with no storage allocated yet
5360 dpsoftrast.texture_firstfree = 1;
5361 dpsoftrast.texture_end = 1;
5362 dpsoftrast.texture_max = 0;
5363 dpsoftrast.color[0] = 1;
5364 dpsoftrast.color[1] = 1;
5365 dpsoftrast.color[2] = 1;
5366 dpsoftrast.color[3] = 1;
5367 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5368 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5369 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5370 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// per-thread default render state
5371 for (i = 0; i < dpsoftrast.numthreads; i++)
5373 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5375 thread->cullface = GL_BACK;
5376 thread->colormask[1] = 1;
5377 thread->colormask[2] = 1;
5378 thread->colormask[3] = 1;
5379 thread->blendfunc[0] = GL_ONE;
5380 thread->blendfunc[1] = GL_ZERO;
5381 thread->depthmask = true;
5382 thread->depthtest = true;
5383 thread->depthfunc = GL_LEQUAL;
5384 thread->scissortest = false;
5385 thread->alphatest = false;
5386 thread->alphafunc = GL_GREATER;
5387 thread->alphavalue = 0.5f;
5388 thread->viewport[0] = 0;
5389 thread->viewport[1] = 0;
5390 thread->viewport[2] = dpsoftrast.fb_width;
5391 thread->viewport[3] = dpsoftrast.fb_height;
5392 thread->scissor[0] = 0;
5393 thread->scissor[1] = 0;
5394 thread->scissor[2] = dpsoftrast.fb_width;
5395 thread->scissor[3] = dpsoftrast.fb_height;
5396 thread->depthrange[0] = 0;
5397 thread->depthrange[1] = 1;
5398 thread->polygonoffset[0] = 0;
5399 thread->polygonoffset[1] = 0;
5401 DPSOFTRAST_RecalcThread(thread);
5403 thread->numspans = 0;
5404 thread->numtriangles = 0;
5405 thread->commandoffset = 0;
5406 thread->waiting = false;
5407 thread->starving = false;
// force a full validation of the derived state
5409 thread->validate = -1;
5410 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker thread exist only in threaded mode
5412 if (dpsoftrast.usethreads)
5414 thread->waitcond = Thread_CreateCond();
5415 thread->drawcond = Thread_CreateCond();
5416 thread->drawmutex = Thread_CreateMutex();
5417 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5423 void DPSOFTRAST_Shutdown(void)
5426 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5428 DPSOFTRAST_State_Thread *thread;
5429 for (i = 0; i < dpsoftrast.numthreads; i++)
5431 thread = &dpsoftrast.threads[i];
5432 Thread_LockMutex(thread->drawmutex);
5434 Thread_CondSignal(thread->drawcond);
5435 Thread_UnlockMutex(thread->drawmutex);
5436 Thread_WaitThread(thread->thread, 0);
5437 Thread_DestroyCond(thread->waitcond);
5438 Thread_DestroyCond(thread->drawcond);
5439 Thread_DestroyMutex(thread->drawmutex);
5442 for (i = 0;i < dpsoftrast.texture_end;i++)
5443 if (dpsoftrast.texture[i].bytes)
5444 MM_FREE(dpsoftrast.texture[i].bytes);
5445 if (dpsoftrast.texture)
5446 free(dpsoftrast.texture);
5447 if (dpsoftrast.threads)
5448 MM_FREE(dpsoftrast.threads);
5449 memset(&dpsoftrast, 0, sizeof(dpsoftrast));