3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform abstraction for alignment, memory barriers and atomic counters.
// Variants are provided for Apple (OSAtomic), GCC (__sync builtins) and
// MSVC (Interlocked*), followed by plain non-atomic fallback definitions.
// ATOMIC_SIZE is the alignment passed to _mm_malloc by the MM_* helpers.
18 #define ATOMIC_SIZE 32
// Apple: 16-byte ALIGN, 32-byte ATOMIC, counters via OSAtomic*32Barrier.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC: same alignment attributes, counters via the __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment, no barrier, and plain (non-atomic) arithmetic.
// ATOMIC_ADD always discards its result (cast to void) in every variant.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// Allocation helpers. The first set is built on _mm_malloc/_mm_free with
// ATOMIC_SIZE alignment; the second set maps to plain malloc/calloc.
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: allocates nmemb*size bytes aligned to ATOMIC_SIZE and
// zero-fills the block when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow.
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
85 #define MM_FREE _mm_free
// Unaligned alternatives using the C library allocator.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the vertex attribute arrays: position, color and eight
// texcoord sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-attribute tables such as the triangle attribs array.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One texture slot. A slot with bytes == NULL is treated as unused by the
// texture lookup and allocation code in this file.
108 typedef struct DPSOFTRAST_Texture_s
115 DPSOFTRAST_TEXTURE_FILTER filter;
// counter declared with the atomic counter type
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels
119 unsigned char *bytes;
// per mip level, as filled in by DPSOFTRAST_Texture_New:
// [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are sized in multiples of COMMAND_SIZE (see
// DPSOFTRAST_ALIGNCOMMAND) and declared with the ALIGN attribute.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: an opcode and the command's size
// as stored by DPSOFTRAST_AllocateCommand.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when the remaining tail
// of the pool is too small for the requested command.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the aligned struct
// typedef DPSOFTRAST_Command_<name>, which starts with the same
// opcode/commandsize header as DPSOFTRAST_Command.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command pool buffer (2 MiB).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command pool: a byte buffer from which DPSOFTRAST_AllocateCommand carves
// commands, indexed by the pool's freecommand offset.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed when shading spans.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// per attribute array, three 4-float vectors; the CALCATTRIB macros read
// [0] as the per-x term, [1] as the per-y term and [2] as the base value
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// SSE evaluation of one attribute at a span's start:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// Uses aligned loads (_mm_load_ps), so the attribs rows must be 16-byte aligned.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar evaluation of one attribute at a span's start, writing four floats:
//   slope[k] = attribs[arrayindex][0][k]
//   data[k]  = attribs[arrayindex][2][k] + span->x * slope[k] + span->y * attribs[arrayindex][1][k]
// triangle and span are pointers; data and slope are float[4] (or float *).
// Every macro argument is parenthesized so that expressions such as
// `&span` or `base + 1` expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
// Span state: one horizontal run of pixels belonging to a triangle.
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
192 int depthslope; // depthbuffer value pixel delta
194 DPSOFTRAST_State_Span);
// Capacities of the per-thread span and triangle arrays and of the
// pixel mask buffer (see DPSOFTRAST_State_Thread).
196 #define DPSOFTRAST_DRAW_MAXSPANS 1024
197 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
198 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bit flags stored in thread->validate; each marks one group of derived
// values as stale (see DPSOFTRAST_Validate). VALIDATE_DRAW is all of them.
200 #define DPSOFTRAST_VALIDATE_FB 1
201 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
202 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
203 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Internal blend modes selected by DPSOFTRAST_RecalcBlendFunc from the
// (sfactor, dfactor) pair and the subtract flag. DPSOFTRAST_BLENDMODE_TOTAL
// is the number of modes.
205 typedef enum DPSOFTRAST_BLENDMODE_e
207 DPSOFTRAST_BLENDMODE_OPAQUE,
208 DPSOFTRAST_BLENDMODE_ALPHA,
209 DPSOFTRAST_BLENDMODE_ADDALPHA,
210 DPSOFTRAST_BLENDMODE_ADD,
211 DPSOFTRAST_BLENDMODE_INVMOD,
212 DPSOFTRAST_BLENDMODE_MUL,
213 DPSOFTRAST_BLENDMODE_MUL2,
214 DPSOFTRAST_BLENDMODE_SUBALPHA,
215 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
216 DPSOFTRAST_BLENDMODE_INVADD,
217 DPSOFTRAST_BLENDMODE_TOTAL
219 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state: a private copy of the render state that the
// DPSOFTRAST_Interpret_* functions update, the values derived from it, and
// the thread's span/triangle work buffers.
221 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
240 float polygonoffset[2];
// clip plane rescaled by the viewport transform (see DPSOFTRAST_RecalcClipPlane)
242 ALIGN(float fb_clipplane[4]);
245 int shader_permutation;
246 int shader_exactspecularmath;
248 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
250 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
251 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
253 // DPSOFTRAST_VALIDATE_ flags
256 // derived values (DPSOFTRAST_VALIDATE_FB)
259 ALIGN(float fb_viewportcenter[4]);
260 ALIGN(float fb_viewportscale[4]);
262 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
265 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; compared against dpsoftrast.drawcommand and
// commandpool.freecommand in DPSOFTRAST_Draw_FreeCommandPool
274 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond in
// DPSOFTRAST_Draw_FreeCommandPool
276 volatile bool waiting;
277 volatile bool starving;
284 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
285 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
286 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
288 DPSOFTRAST_State_Thread);
// Global rasterizer state: framebuffer pointers, the caller-side copy of
// the render state, vertex array pointers, the texture table, the worker
// thread array and the shared command pool.
290 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers (NULL entries are skipped by
// DPSOFTRAST_Interpret_ClearColor)
294 unsigned int *fb_depthpixels;
295 unsigned int *fb_colorpixels[4];
298 ALIGN(float fb_viewportcenter[4]);
299 ALIGN(float fb_viewportscale[4]);
302 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
303 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// client vertex array pointers
305 const float *pointer_vertex3f;
306 const float *pointer_color4f;
307 const unsigned char *pointer_color4ub;
308 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
311 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
312 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
313 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
317 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
318 float *screencoord4f;
324 int shader_permutation;
325 int shader_exactspecularmath;
// texture table: the array itself plus the index at which
// DPSOFTRAST_Texture_New starts searching for a free slot
329 int texture_firstfree;
330 DPSOFTRAST_Texture *texture;
// set to a static message by functions in this file when they reject a request
335 const char *errorstring;
340 DPSOFTRAST_State_Thread *threads;
// pool offset published by DPSOFTRAST_Draw_SyncCommands
342 ATOMIC(volatile int drawcommand);
344 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance used throughout this file
348 DPSOFTRAST_State dpsoftrast;
// Depth buffer scale (2^30) and offset constants.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one integer with blue in
// bits 0-7, green in bits 8-15, red in bits 16-23 and alpha in bits 24-31.
// Each channel is scaled by 255 and rounded by adding 0.5 before truncation.
// Arguments are parenthesized so expression arguments are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer encoding
// DPSOFTRAST_DEPTHSCALE * (1 - d); d is parenthesized so that an expression
// argument is subtracted as a whole.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from a viewport rectangle.
// viewport is {x, y, width, height}; the two outputs are 4-float vectors.
// Index 1 holds the horizontal term, index 2 the vertical term (flipped
// against dpsoftrast.fb_height, hence the negative scale), index 3 a fixed
// 0.5/0.5 pair, and index 0 the constants center 0 and scale 1.
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
360 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362 fb_viewportcenter[3] = 0.5f;
363 fb_viewportcenter[0] = 0.0f;
364 fb_viewportscale[1] = 0.5f * viewport[2];
365 fb_viewportscale[2] = -0.5f * viewport[3];
366 fb_viewportscale[3] = 0.5f;
367 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row bands [miny1, maxy1) and [miny2, maxy2)
// assigned to this thread, based on thread->index, dpsoftrast.numthreads
// and dpsoftrast.fb_height.
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// interlaced: the height is cut into 2*numthreads slices; the thread gets
// slice `index` and slice `numthreads + index`
372 if (dpsoftrast.interlace)
374 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// presumably the non-interlaced case: both bands are the same single slice
// out of numthreads
381 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Rebuilds thread->fb_clipplane from thread->clipplane using the thread's
// viewport transform: each component is divided by the matching viewport
// scale, then the constant term [3] is reduced by the dot product of the
// viewport center with the scaled plane.
// Note the index mapping: plane [0],[1],[2],[3] pair with viewport
// [1],[2],[3],[0].
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
388 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything derived from the framebuffer-related state of a
// thread: the clamped scissor rectangle, the viewport transform, the clip
// plane and the thread's row bands.
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
397 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398 // and viewport projection values
// scissor is {x, y, width, height}; y is flipped against fb_height
401 x1 = thread->scissor[0];
402 x2 = thread->scissor[0] + thread->scissor[2];
403 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the whole framebuffer is used
405 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
407 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
409 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// fb_scissor is stored as {x, y, width, height}
410 thread->fb_scissor[0] = x1;
411 thread->fb_scissor[1] = y1;
412 thread->fb_scissor[2] = x2 - x1;
413 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport values, so it is rebuilt after them
415 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416 DPSOFTRAST_RecalcClipPlane(thread);
417 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth comparison: the configured depthfunc when
// depth testing is enabled, GL_ALWAYS otherwise.
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
422 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the GL-style (sfactor, dfactor) pair in thread->blendfunc to an
// internal DPSOFTRAST_BLENDMODE stored in thread->fb_blendmode. The pair is
// packed as (sfactor<<16)|dfactor and matched in a switch; combinations
// that are not listed select DPSOFTRAST_BLENDMODE_OPAQUE.
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only SRC_ALPHA/ONE is recognized
427 if (thread->blendsubtract)
429 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
431 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// presumably the non-subtractive case: the full table of recognized pairs
439 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
441 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// two equivalent ways of expressing a multiply both map to MUL
446 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Cheap front end for DPSOFTRAST_Validate: only makes the call when at
// least one of the requested flags f is still pending in thread->validate.
// Always evaluates to 0. The thread argument is parenthesized so that
// pointer expressions (e.g. `base + i`) expand correctly.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state selected by mask. Only flags that are both
// requested and still pending in thread->validate are processed; each flag
// is cleared before its Recalc function runs.
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the flags that are actually stale
460 mask &= thread->validate;
463 if (mask & DPSOFTRAST_VALIDATE_FB)
465 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466 DPSOFTRAST_RecalcFB(thread);
468 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
470 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471 DPSOFTRAST_RecalcDepthFunc(thread);
473 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
475 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture slot by index. A slot is returned only when the index
// lies in [1, dpsoftrast.texture_end) and the slot has pixel storage
// (bytes != NULL); index 0 is never valid.
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
482 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483 return &dpsoftrast.texture[index];
// Enlarges the texture table with realloc and rebases every bound-texture
// pointer (global and per thread) onto the new array, since realloc may
// move the storage.
487 static void DPSOFTRAST_Texture_Grow(void)
// remembered so that existing pointers can be converted to indices
489 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490 DPSOFTRAST_State_Thread *thread;
494 // expand texture array as needed
495 if (dpsoftrast.texture_max < 1024)
496 dpsoftrast.texture_max = 1024;
498 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture directly
// and is not checked; on failure the old array would be lost and the
// rebasing below would operate on a NULL base.
499 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings: new base + (old pointer - old base)
500 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501 if (dpsoftrast.texbound[i])
502 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase every thread's bindings the same way
503 for (j = 0; j < dpsoftrast.numthreads; j++)
505 thread = &dpsoftrast.threads[j];
506 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507 if (thread->texbound[i])
508 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture from flags and dimensions: validates the request,
// claims a slot in the texture table, fills in the mip level table and
// allocates zeroed pixel storage. Rejected requests store a message in
// dpsoftrast.errorstring.
// NOTE(review): the return value is presumably the texture index, with a
// failure value on rejection; confirm against the header.
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps have six sides, everything else one
521 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed before the per-dimension size limit
// is checked, so very large inputs could overflow int here
524 if (width*height*depth < 1)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
529 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
536 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
540 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are not supported
558 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
560 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
563 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
565 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check directly above
568 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
570 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
573 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x is not a power of two (for x >= 1)
578 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
583 // find first empty slot in texture array
584 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot claimed here
587 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table when the chosen index is beyond its capacity
588 if (dpsoftrast.texture_max <= texnum)
589 DPSOFTRAST_Texture_Grow();
590 if (dpsoftrast.texture_end <= texnum)
591 dpsoftrast.texture_end = texnum + 1;
592 texture = &dpsoftrast.texture[texnum];
593 memset(texture, 0, sizeof(*texture));
594 texture->flags = flags;
595 texture->width = width;
596 texture->height = height;
597 texture->depth = depth;
598 texture->sides = sides;
// mip table entry: byte offset, byte size (4 bytes per texel, all sides),
// width, height, depth
610 s = w * h * d * sides * 4;
611 texture->mipmap[mipmaps][0] = size;
612 texture->mipmap[mipmaps][1] = s;
613 texture->mipmap[mipmaps][2] = w;
614 texture->mipmap[mipmaps][3] = h;
615 texture->mipmap[mipmaps][4] = d;
// stop condition for the mip chain: a 1x1x1 level was reached, or the
// texture was created without the MIPMAP flag
618 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
624 texture->mipmaps = mipmaps;
625 texture->size = size;
627 // allocate the pixels now
// NOTE(review): the allocation result is not checked in the lines shown
628 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and clears its slot. Does nothing when
// the index does not refer to a live texture.
632 void DPSOFTRAST_Texture_Free(int index)
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
639 MM_FREE(texture->bytes);
640 texture->bytes = NULL;
641 memset(texture, 0, sizeof(*texture));
642 // adjust the free range and used range
// the freed slot becomes the first search candidate if it is lower
643 if (dpsoftrast.texture_firstfree > index)
644 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing empty slots
645 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646 dpsoftrast.texture_end--;
// Regenerates every mip level above 0 by averaging texels of the level
// below it (4 bytes per texel, each byte channel averaged independently
// with rounding). Does nothing for an invalid index or a texture with a
// single level.
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
650 int i, x, y, z, w, layer0, layer1, row0, row1;
651 unsigned char *o, *i0, *i1, *i2, *i3;
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654 if (texture->mipmaps <= 1)
// level i is built from level i-1
656 for (i = 1;i < texture->mipmaps;i++)
658 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
662 if (layer1 >= texture->mipmap[i-1][4])
663 layer1 = texture->mipmap[i-1][4]-1;
664 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
668 if (row1 >= texture->mipmap[i-1][3])
669 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the four source rows in level i-1
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
670 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
671 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675 w = texture->mipmap[i][2];
678 if (texture->mipmap[i-1][2] > 1)
680 // average 3D texture
// eight source texels per output texel: sum + 4, then >> 3
681 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
683 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691 // average 3D mipmap with parent width == 1
// four source texels per output texel: sum + 2, then >> 2
692 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
694 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
703 if (texture->mipmap[i-1][2] > 1)
705 // average 2D texture (common case)
// 2x2 box filter over rows i0 and i1
706 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716 // 2D texture with parent width == 1
// two source texels: sum + 1, then >> 1
717 o[0] = (i0[0] + i1[0] + 1) >> 1;
718 o[1] = (i0[1] + i1[1] + 1) >> 1;
719 o[2] = (i0[2] + i1[2] + 1) >> 1;
720 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangular block of 4-byte texels into the texture's base
// level, one row at a time, then regenerates the mipmaps. pixels is read as
// tightly packed rows of blockwidth texels.
// NOTE(review): the destination is always computed from mipmap[0]; the mip
// parameter is not used in the lines shown. No bounds check on the block
// rectangle is visible either - confirm against the full function.
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
729 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel: row blocky, column blockx of level 0
736 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737 while (blockheight > 0)
739 memcpy(dst, pixels, blockwidth * 4);
// the source advances by the block width, the destination by the full
// level-0 row width
740 pixels += blockwidth * 4;
741 dst += texture->mipmap[0][2] * 4;
745 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level with the given pixels (mipmap[0][1] bytes
// are copied) and regenerates the mipmaps. Does nothing for an invalid index.
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
754 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of the given mip level, or 0 for an invalid texture index.
// NOTE(review): mip is used as an array index without a range check.
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761 return texture->mipmap[mip][2];
// Returns the height of the given mip level, or 0 for an invalid texture index.
// NOTE(review): mip is used as an array index without a range check.
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
765 DPSOFTRAST_Texture *texture;
766 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767 return texture->mipmap[mip][3];
// Returns the depth of the given mip level, or 0 for an invalid texture index.
// NOTE(review): mip is used as an array index without a range check.
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
771 DPSOFTRAST_Texture *texture;
772 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of the given mip level inside the
// texture's pixel storage, or a null pointer for an invalid texture index.
// NOTE(review): mip is used as an array index without a range check.
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
777 DPSOFTRAST_Texture *texture;
778 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
781 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected with an error message when
// the texture was created without the MIPMAP flag.
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
785 DPSOFTRAST_Texture *texture;
786 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
789 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794 texture->filter = filter;
797 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the current end of the command pool: copies
// commandpool.freecommand to dpsoftrast.drawcommand, issuing the memory
// barrier first when threads are in use.
799 static void DPSOFTRAST_Draw_SyncCommands(void)
801 if(dpsoftrast.usethreads) MEMORY_BARRIER;
802 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make `space` bytes available in the command pool. It measures
// how far each thread's commandoffset lags behind freecommand, takes the
// largest lag as the amount of pool still in use, and waits on a lagging
// thread when that is still too much.
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
807 DPSOFTRAST_State_Thread *thread;
809 int freecommand = dpsoftrast.commandpool.freecommand;
810 int usedcommands = dpsoftrast.commandpool.usedcommands;
// already enough room according to the cached count
811 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
813 DPSOFTRAST_Draw_SyncCommands();
819 for (i = 0; i < dpsoftrast.numthreads; i++)
821 thread = &dpsoftrast.threads[i];
// distance from the thread's read position to the write position,
// wrapped into the pool size
822 commandoffset = freecommand - thread->commandoffset;
823 if (commandoffset < 0)
824 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest lag seen so far
825 if (commandoffset > usedcommands)
828 usedcommands = commandoffset;
// enough room now, or no thread to wait on
831 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait, under the thread's draw mutex, until the thread signals waitcond;
// the thread is woken first if it has flagged itself as starving
833 thread = &dpsoftrast.threads[waitindex];
834 Thread_LockMutex(thread->drawmutex);
835 if (thread->commandoffset != dpsoftrast.drawcommand)
837 thread->waiting = true;
838 if (thread->starving) Thread_CondSignal(thread->drawcond);
839 Thread_CondWait(thread->waitcond, thread->drawmutex);
840 thread->waiting = false;
842 Thread_UnlockMutex(thread->drawmutex);
// store the recomputed usage back into the pool
844 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds size up to the next multiple of COMMAND_SIZE (COMMAND_SIZE must be
// a power of two for the mask arithmetic to work).
847 #define DPSOFTRAST_ALIGNCOMMAND(size) \
848 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type DPSOFTRAST_Command_<name> from the pool with
// its opcode and aligned size, returning a typed pointer to it.
849 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
850 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool for a command with the given
// opcode and writes the command header (opcode and commandsize). Commands
// are never split across the end of the pool: when the tail is too small a
// Reset command is written there instead and the tail is counted as used.
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
854 DPSOFTRAST_Command *command;
855 int freecommand = dpsoftrast.commandpool.freecommand;
856 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header, plus the unusable tail
// when the command does not fit before the end of the pool
857 int extra = sizeof(DPSOFTRAST_Command);
858 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim it from the worker threads, or
// (presumably when threads are not used) flush the pending commands
860 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
862 if (dpsoftrast.usethreads)
863 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
865 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool counters
866 freecommand = dpsoftrast.commandpool.freecommand;
867 usedcommands = dpsoftrast.commandpool.usedcommands;
// tail too small: mark it with a Reset command and account for it
869 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
871 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872 command->opcode = DPSOFTRAST_OPCODE_Reset;
873 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
876 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877 command->opcode = opcode;
878 command->commandsize = size;
// wrap check for the write position after it has been advanced
880 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
882 dpsoftrast.commandpool.freecommand = freecommand;
883 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Gives back `size` bytes of the command pool: the used count is reduced by
// size and the updated write position is stored back.
// NOTE(review): the addition of the pool size below is presumably the
// wrap-around correction after stepping the write position backwards.
887 static void DPSOFTRAST_UndoCommand(int size)
889 int freecommand = dpsoftrast.commandpool.freecommand;
890 int usedcommands = dpsoftrast.commandpool.usedcommands;
893 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894 usedcommands -= size;
895 dpsoftrast.commandpool.freecommand = freecommand;
896 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: set the viewport rectangle.
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle in thread->viewport and marks the
// framebuffer-derived values as stale.
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
902 thread->viewport[0] = command->x;
903 thread->viewport[1] = command->y;
904 thread->viewport[2] = command->width;
905 thread->viewport[3] = command->height;
906 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Viewport command and also updates the global
// viewport copy together with the global viewport center/scale.
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
910 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913 command->width = width;
914 command->height = height;
916 dpsoftrast.viewport[0] = x;
917 dpsoftrast.viewport[1] = y;
918 dpsoftrast.viewport[2] = width;
919 dpsoftrast.viewport[3] = height;
920 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: clear the color buffers to an RGBA color.
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: fills the part of the scissor rectangle that falls inside
// this thread's row bands, for each of the (up to four) color buffers.
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
926 int i, x1, y1, x2, y2, w, h, x, y;
927 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
931 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932 miny1 = thread->miny1;
933 maxy1 = thread->maxy1;
934 miny2 = thread->miny2;
935 maxy2 = thread->maxy2;
// scissor rectangle as [x1, x2) x [y1, y2)
936 x1 = thread->fb_scissor[0];
937 y1 = thread->fb_scissor[1];
938 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip vertically to the span from the first band's start to the second
// band's end
940 if (y1 < miny1) y1 = miny1;
941 if (y2 > maxy2) y2 = maxy2;
946 // FIXME: honor fb_colormask?
947 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948 for (i = 0;i < 4;i++)
950 if (!dpsoftrast.fb_colorpixels[i])
// outer loop over the two bands: bandy is the end row of the current band;
// after the first band y jumps forward to miny2
952 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
// start of row y in color buffer i
955 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearColor command.
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
963 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: clear the depth buffer to a depth value.
970 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: fills the part of the scissor rectangle that falls inside
// this thread's row bands with the encoded depth value.
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
973 int x1, y1, x2, y2, w, h, x, y;
974 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
978 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979 miny1 = thread->miny1;
980 maxy1 = thread->maxy1;
981 miny2 = thread->miny2;
982 maxy2 = thread->maxy2;
// scissor rectangle as [x1, x2) x [y1, y2)
983 x1 = thread->fb_scissor[0];
984 y1 = thread->fb_scissor[1];
985 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
987 if (y1 < miny1) y1 = miny1;
988 if (y2 > maxy2) y2 = maxy2;
// convert the float depth to the integer depth buffer encoding
993 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// outer loop over the two bands: bandy is the end row of the current band;
// after the first band y jumps forward to miny2
994 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
997 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998 for (x = x1;x < x2;x++)
// Public entry point: queues a ClearDepth command.
1002 void DPSOFTRAST_ClearDepth(float d)
1004 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: enable or disable writes per color channel.
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: normalizes each channel flag to 0/1 and builds the packed
// pixel write mask fb_colormask.
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1011 thread->colormask[0] = command->r != 0;
1012 thread->colormask[1] = command->g != 0;
1013 thread->colormask[2] = command->b != 0;
1014 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag yields 0 or all-ones, which then selects the
// channel's byte: red 0x00FF0000, green 0x0000FF00, blue 0x000000FF,
// alpha 0xFF000000
1015 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public entry point: queues a ColorMask command.
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1019 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: enable or disable depth testing.
1026 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function as stale.
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1029 thread->depthtest = command->enable;
1030 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public entry point: queues a DepthTest command.
1032 void DPSOFTRAST_DepthTest(int enable)
1034 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035 command->enable = enable;
// Command 6: enable or disable the scissor test.
1038 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived values as
// stale (the effective scissor rectangle depends on it).
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1041 thread->scissortest = command->enable;
1042 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ScissorTest command.
1044 void DPSOFTRAST_ScissorTest(int enable)
1046 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047 command->enable = enable;
// Command 7: set the scissor rectangle (carried as floats).
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle in thread->scissor and marks the
// framebuffer-derived values as stale.
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1053 thread->scissor[0] = command->x;
1054 thread->scissor[1] = command->y;
1055 thread->scissor[2] = command->width;
1056 thread->scissor[3] = command->height;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a Scissor command.
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1061 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1064 command->width = width;
1065 command->height = height;
// Command 8: set the source and destination blend factors.
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores both factors and marks the derived blend mode as stale.
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1071 thread->blendfunc[0] = command->sfactor;
1072 thread->blendfunc[1] = command->dfactor;
1073 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command carrying both factors.
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1077 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078 command->sfactor = sfactor;
1079 command->dfactor = dfactor;
// BlendSubtract command with a single int payload `enable`.
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to the given thread state.
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1085 thread->blendsubtract = command->enable;
// shares the BLENDFUNC validation bit with BlendFunc
1086 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and stores `enable` in it.
1088 void DPSOFTRAST_BlendSubtract(int enable)
1090 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091 command->enable = enable;
// DepthMask command with a single int payload `enable`.
1094 DEFCOMMAND(10, DepthMask, int enable;)
// Copies the flag into thread->depthmask; no validate bit is set on the visible lines.
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1097 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and stores `enable` in it.
1099 void DPSOFTRAST_DepthMask(int enable)
1101 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102 command->enable = enable;
// DepthFunc command with a single int payload `func`.
1105 DEFCOMMAND(11, DepthFunc, int func;)
// Copies the comparison function selector into thread->depthfunc.
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1108 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and stores `func` in it.
1110 void DPSOFTRAST_DepthFunc(int func)
1112 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113 command->func = func;
// DepthRange command: near and far values (floats).
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the range as thread->depthrange[0] (near) and [1] (far).
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1119 thread->depthrange[0] = command->nearval;
1120 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command carrying near/far.
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1124 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125 command->nearval = nearval;
1126 command->farval = farval;
// PolygonOffset command: two float offsets, `alongnormal` and `intoview`.
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Stores the offsets as thread->polygonoffset[0] (alongnormal) and [1] (intoview).
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1132 thread->polygonoffset[0] = command->alongnormal;
1133 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command carrying both offsets.
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1137 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138 command->alongnormal = alongnormal;
1139 command->intoview = intoview;
// CullFace command with a single int payload `mode`.
1142 DEFCOMMAND(14, CullFace, int mode;)
// Copies the cull mode into thread->cullface.
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1145 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and stores `mode` in it.
1147 void DPSOFTRAST_CullFace(int mode)
1149 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150 command->mode = mode;
// AlphaTest command with a single int payload `enable`.
1153 DEFCOMMAND(15, AlphaTest, int enable;)
// Copies the flag into thread->alphatest.
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1156 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and stores `enable` in it.
1158 void DPSOFTRAST_AlphaTest(int enable)
1160 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161 command->enable = enable;
// AlphaFunc command: comparison selector `func` and reference value `ref`.
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Stores func in thread->alphafunc and ref in thread->alphavalue.
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1167 thread->alphafunc = command->func;
1168 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command.
// NOTE(review): only the `func` store is visible here; the `ref` store is presumably on a
// line dropped from this extraction - confirm.
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1172 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173 command->func = func;
// Sets the current constant color in the global dpsoftrast state (r, g, b, a order).
// Unlike the setters above, this writes dpsoftrast.color directly and queues no command.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads a rectangle of the color framebuffer (dpsoftrast.fb_colorpixels[0]) into outpixels.
//   blockx, blocky           - origin of the requested rectangle
//   blockwidth, blockheight  - size of the requested rectangle
//   outpixels                - destination, written with a row pitch of blockwidth*4 bytes
// The rectangle is clamped to the framebuffer bounds before copying.
// NOTE(review): the per-pixel copy statements and some declarations (bx1, by1, x, y, b, o)
// are on lines not visible in this chunk.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte pitch of the output rows and of the framebuffer rows (4 bytes per pixel)
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clamp the requested rectangle to the framebuffer
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// separate loops for big-endian hosts and the other case
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// source row is (fb_height - 1 - y): the framebuffer is read vertically flipped
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer into mip level `mip`
// of texture `index`.
//   tx, ty - destination origin inside the texture mip level
//   sx, sy - source origin inside the framebuffer
// Returns early when the texture handle or mip level is invalid. Both rectangles are
// clamped to their surfaces and the copy size is the smaller of the two.
// NOTE(review): declarations of tx1/ty1/sx1/sy1/tw/th/sw/sh and the statements computing
// tw/th/sw/sh are on lines not visible in this chunk.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: the color framebuffer
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into texture->bytes,
// [2] and [3] as the level's width and height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clamp both rectangles to their surfaces
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// copy no more than the source rectangle provides
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// flip the source start row; rows are then walked downward (sy1 - y) while the
// destination walks upward (ty1 + y)
1281 sy1 = sheight - 1 - sy1;
1282 for (y = 0;y < th;y++)
1283 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// regenerate the mip chain when the texture has more than one level
1284 if (texture->mipmaps > 1)
1285 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command: texture unit number and the texture to bind (may be NULL).
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Rebinds a texture unit in the thread state: drops this thread's reference on the
// previously bound texture (if any) by atomically decrementing its `binds` counter,
// then stores the new binding.
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 if (thread->texbound[command->unitnum])
1292 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293 thread->texbound[command->unitnum] = command->texture;
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 DPSOFTRAST_Command_SetTexture *command;
1298 DPSOFTRAST_Texture *texture;
1299 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1304 texture = DPSOFTRAST_Texture_GetByIndex(index);
1305 if (index && !texture)
1307 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1311 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312 command->unitnum = unitnum;
1313 command->texture = texture;
1315 dpsoftrast.texbound[unitnum] = texture;
1316 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array pointer and its stride in the global state.
// The pointer is stored, not copied. The stride is used as a byte stride when the array is loaded.
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1321 dpsoftrast.pointer_vertex3f = vertex3f;
1322 dpsoftrast.stride_vertex = stride;
// Selects a float RGBA color array as the color source and clears the byte-color
// pointer, so only one of the two color sources is set at a time.
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1326 dpsoftrast.pointer_color4f = color4f;
1327 dpsoftrast.pointer_color4ub = NULL;
1328 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte RGBA color array as the color source and clears the
// float-color pointer. Both variants share dpsoftrast.stride_color.
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1332 dpsoftrast.pointer_color4f = NULL;
1333 dpsoftrast.pointer_color4ub = color4ub;
1334 dpsoftrast.stride_color = stride;
// Records the texcoord array for texcoord slot `unitnum`: pointer, component count and stride.
// NOTE(review): unitnum is used as an array index without a range check on the visible lines.
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1338 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command: shader mode, permutation and the exactspecularmath flag.
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Copies the three shader selection fields into the thread state.
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1346 thread->shader_mode = command->mode;
1347 thread->shader_permutation = command->permutation;
1348 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command and also mirrors the same three
// values into the global dpsoftrast state.
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1352 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353 command->mode = mode;
1354 command->permutation = permutation;
1355 command->exactspecularmath = exactspecularmath;
// mirror into the global state
1357 dpsoftrast.shader_mode = mode;
1358 dpsoftrast.shader_permutation = permutation;
1359 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command: uniform slot index plus four floats.
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Copies the four floats into thread->uniform4f at offset index*4 (four floats per slot).
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1365 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command with (v0, v1, v2, v3) for slot `index`
// and mirrors the same values into the global dpsoftrast.uniform4f array.
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1369 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370 command->index = index;
1371 command->val[0] = v0;
1372 command->val[1] = v1;
1373 command->val[2] = v2;
1374 command->val[3] = v3;
// mirror into the global state (four floats per slot)
1376 dpsoftrast.uniform4f[index*4+0] = v0;
1377 dpsoftrast.uniform4f[index*4+1] = v1;
1378 dpsoftrast.uniform4f[index*4+2] = v2;
1379 dpsoftrast.uniform4f[index*4+3] = v3;
// Pointer variant of DPSOFTRAST_Uniform4f: reads four floats from `v`, queues a
// Uniform4f command and mirrors the values into the global dpsoftrast.uniform4f array.
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1383 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384 command->index = index;
1385 memcpy(command->val, v, sizeof(command->val));
1387 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command: uniform slot index plus 16 floats (declared through ALIGN).
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Copies all 16 floats into thread->uniform4f starting at offset index*4, so one
// matrix occupies four consecutive 4-float slots.
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1393 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
//   v - source floats, 16 per matrix; may be unaligned
// For each matrix a UniformMatrix4f command is queued and the rows are also
// stored into the global dpsoftrast.uniform4f array.
// NOTE(review): the condition guarding the transpose block (presumably a test of
// `transpose`) is on a line not visible in this chunk - confirm its polarity.
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix advances the slot index by 4 and the source pointer by 16 floats
1399 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1401 __m128 m0, m1, m2, m3;
1402 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403 command->index = (DPSOFTRAST_UNIFORM)index;
// unaligned source pointer: use unaligned loads
1404 if (((size_t)v)&(ALIGN_SIZE-1))
1406 m0 = _mm_loadu_ps(v);
1407 m1 = _mm_loadu_ps(v+4);
1408 m2 = _mm_loadu_ps(v+8);
1409 m3 = _mm_loadu_ps(v+12);
// aligned loads
1413 m0 = _mm_load_ps(v);
1414 m1 = _mm_load_ps(v+4);
1415 m2 = _mm_load_ps(v+8);
1416 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack + movelh/movehl
1420 __m128 t0, t1, t2, t3;
1421 t0 = _mm_unpacklo_ps(m0, m1);
1422 t1 = _mm_unpacklo_ps(m2, m3);
1423 t2 = _mm_unpackhi_ps(m0, m1);
1424 t3 = _mm_unpackhi_ps(m2, m3);
1425 m0 = _mm_movelh_ps(t0, t1);
1426 m1 = _mm_movehl_ps(t1, t0);
1427 m2 = _mm_movelh_ps(t2, t3);
1428 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1430 _mm_store_ps(command->val, m0);
1431 _mm_store_ps(command->val+4, m1);
1432 _mm_store_ps(command->val+8, m2);
1433 _mm_store_ps(command->val+12, m3);
// mirror into the global state; aligned stores, so uniform4f[index*4] must be 16-byte aligned
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command: uniform slot index plus one int value.
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Stores the int into thread->uniform1i[index].
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1445 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command for slot `index` and mirrors i0
// into the global dpsoftrast.uniform1i array.
// NOTE(review): the store of i0 into the command's `val` is presumably on a line
// dropped from this extraction - confirm.
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1449 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450 command->index = index;
1453 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command: four plane coefficients.
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Copies the four coefficients into thread->clipplane and sets the FB validate bit.
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1459 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ClipPlane command with coefficients (x, y, z, w).
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1464 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465 command->clipplane[0] = x;
1466 command->clipplane[1] = y;
1467 command->clipplane[2] = z;
1468 command->clipplane[3] = w;
// Copies `size` 4-float items from a strided byte source into a packed float4 array.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// Uses unaligned loads when either src or stride is not a multiple of ALIGN_SIZE.
// NOTE(review): the loop headers and pointer increments are on lines not visible here.
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1474 float *end = dst + size*4;
1475 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1479 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1488 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float items from a strided byte source into packed float4 items
// whose fourth component is 1.0f.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed (stride == sizeof(float[3])) four items are
// converted per iteration from three 16-byte loads; the other path converts one
// item per 16-byte load.
// The recurring idiom shuffle(2,1,0,3) -> move_ss(1.0f) -> shuffle(0,3,2,1) rotates
// the vector so the unwanted lane sits in lane 0, overwrites it with 1.0f and
// rotates back, giving (x, y, z, 1).
// NOTE(review): every path loads 16 bytes per item, i.e. 4 bytes beyond a 12-byte
// item; for the final item this reads past the item's end - confirm callers
// guarantee readable padding.
// NOTE(review): loop headers, dst/src increments and `else` lines are not visible here.
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1497 float *end = dst + size*4;
// tightly packed source: process groups of four items
1498 if (stride == sizeof(float[3]))
// end of the part of dst covered by whole groups of four
1500 float *end4 = dst + (size&~3)*4;
1501 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1505 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1506 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1519 src += 4*sizeof(float[3]);
// same conversion using aligned loads
1526 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1540 src += 4*sizeof(float[3]);
// one item at a time: unaligned loads if src or stride is misaligned
1544 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1548 __m128 v = _mm_loadu_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565 _mm_store_ps(dst, v);
// Expands `size` 2-float items from a strided byte source into packed float4 items
// of the form (x, y, 0, 1).
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed, two items are converted per 16-byte load.
// NOTE(review): loop headers, dst/src increments and `else` lines are not visible here.
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 float *end = dst + size*4;
// constant supplying the upper two lanes (0, 1) of every output
1575 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1576 if (stride == sizeof(float[2]))
// end of the part of dst covered by whole pairs
1578 float *end2 = dst + (size&~1)*4;
1579 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = x0 y0 x1 y1; the first store takes the low pair, the second the high pair
1583 __m128 v = _mm_loadu_ps((const float *)src);
1584 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1585 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1587 src += 2*sizeof(float[2]);
1594 __m128 v = _mm_load_ps((const float *)src);
1595 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1598 src += 2*sizeof(float[2]);
// single item: load 8 bytes into the low lanes, keep (0, 1) from v2 in the high lanes
1604 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte items from a strided byte source into packed float4 items,
// scaling each byte by 1/255.
//   dst    - destination; written with aligned stores, so it must be 16-byte aligned
//   src    - first source item
//   stride - byte distance between source items
// When the source is tightly packed, four items are converted per 16-byte load.
// NOTE(review): loop headers, dst/src increments and `else` lines are not visible here.
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1612 float *end = dst + size*4;
1613 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1614 if (stride == sizeof(unsigned char[4]))
// end of the part of dst covered by whole groups of four
1616 float *end4 = dst + (size&~3)*4;
1617 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen bytes -> 16-bit (v1, v2) -> 32-bit by interleaving with zero, then convert and scale
1621 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
// same conversion using an aligned load
1634 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640 src += 4*sizeof(unsigned char[4]);
// single item: read the four bytes as one int
// NOTE(review): this dereferences src as an int, which assumes 4-byte readable/aligned data
1646 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills a float4 array with one constant value read from `src` (four floats, may be unaligned).
//   dst  - destination; written with aligned stores, so it must be 16-byte aligned
//   size - number of float4 items (end = dst + 4*size)
// NOTE(review): the loop header and dst increment are on lines not visible here.
1653 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1655 float *end = dst + 4*size;
1656 __m128 v = _mm_loadu_ps(src);
1659 _mm_store_ps(dst, v);
// Transforms `numitems` float4 vectors by a 4x4 matrix.
//   out4f       - destination (may equal in4f)
//   in4f        - source vectors; aligned or unaligned loads are chosen at run time
//   inmatrix16f - 16 floats, loaded as four 4-float groups m0..m3
// Each result is v.x*m0 + v.y*m1 + v.z*m2 + v.w*m3.
// A bitwise-identity matrix takes a fast path that just copies the data when the
// buffers differ.
// NOTE(review): the stores, loop headers and the early return after the identity
// path are on lines not visible in this chunk.
1665 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1668 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1669 __m128 m0, m1, m2, m3;
1671 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1673 // fast case for identity matrix
1674 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1677 end = out4f + numitems*4;
// the matrix is always loaded unaligned
1678 m0 = _mm_loadu_ps(inmatrix16f);
1679 m1 = _mm_loadu_ps(inmatrix16f + 4);
1680 m2 = _mm_loadu_ps(inmatrix16f + 8);
1681 m3 = _mm_loadu_ps(inmatrix16f + 12);
1682 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1686 __m128 v = _mm_loadu_ps(in4f);
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1691 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// same computation using aligned loads
1700 __m128 v = _mm_load_ps(in4f);
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1705 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` float4 vectors from in4f to out4f with memcpy.
// The buffers must not overlap (memcpy semantics).
1713 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1715 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects a clip-space vertex `in` (x, y, z, w) and writes the result to `out`.
// The vector is rotated so lane 0 holds 1.0f and lanes 1..3 hold x, y, z; each lane
// becomes viewportcenter + viewportscale * lane / w; the vector is then rotated back,
// so the result is (x', y', z', center0 + scale0 / w). viewportcenter/viewportscale
// must therefore be supplied in the rotated lane order (lane 0 = the 1/w term).
// `in` is evaluated once; viewportcenter and viewportscale are used unparenthesized.
1719 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1721 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1722 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1723 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1724 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro whose visible body is identical to DPSOFTRAST_PROJECTVERTEX:
// rotate so lane 0 is 1.0f, compute viewportcenter + viewportscale * p / w per lane,
// rotate back into `out`.
// NOTE(review): no use of this macro is visible in this chunk; it may be redundant.
1727 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1729 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1730 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1731 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1732 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Computes out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3 for a float4 vertex.
// NOTE(review): the declaration of `p` (presumably initialized from `in`) is on a
// macro line not visible in this chunk.
1735 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1738 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1739 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1740 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1741 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a conservative screen-space Y range for an axis-aligned bounding box.
//   starty, endy     - outputs: truncated screen rows bounding the box
//   minposf, maxposf - box corners; read with aligned loads (must be 16-byte aligned)
//   inmatrix16f      - 4x4 matrix applied to each corner (unaligned loads)
// clipmask starts at 0xFF, one bit per corner; a corner's bit is cleared when its
// z + w is >= 0. Corners that pass contribute their projected y/w to minproj/maxproj;
// for corners that fail, BBCLIP interpolates along each box edge (neighbours k^1,
// k^2, k^4) towards a passing corner and projects that crossing point instead.
// NOTE(review): the return statement is not visible in this chunk; the function
// presumably returns clipmask - confirm.
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1746 int clipmask = 0xFF;
1747 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// start with an inverted range so the first min/max replaces it
1748 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix row so the transformed Y lands in lane 0,
// where the scalar (_ss) operations below work
1751 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is >= 0
// clear bit k of clipmask and fold the corner's y/w into minproj/maxproj.
// The BBFRONT calls below build the corners by mixing lanes of minpos and maxpos.
// BBCLIP(k), defined further down on partly hidden lines: for a corner whose bit is
// still set, clip each edge to a neighbour whose bit is clear and project the
// crossing point.
1755 #define BBFRONT(k, pos) \
1757 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1762 clipmask &= ~(1<<k); \
1763 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764 minproj = _mm_min_ss(minproj, proj); \
1765 maxproj = _mm_max_ss(maxproj, proj); \
1769 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1770 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1771 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1772 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1773 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1778 if (clipmask&(1<<k)) \
1780 if (!(clipmask&(1<<(k^1)))) \
1782 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785 minproj = _mm_min_ss(minproj, proj); \
1786 maxproj = _mm_max_ss(maxproj, proj); \
1788 if (!(clipmask&(1<<(k^2)))) \
1790 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793 minproj = _mm_min_ss(minproj, proj); \
1794 maxproj = _mm_max_ss(maxproj, proj); \
1796 if (!(clipmask&(1<<(k^4)))) \
1798 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801 minproj = _mm_min_ss(minproj, proj); \
1802 maxproj = _mm_max_ss(maxproj, proj); \
1806 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into lane 0 for the scalar math below
1807 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to screen rows
1809 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (truncated, endy + 1)
1813 *starty = _mm_cvttss_si32(maxproj);
1814 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` float4 vertices to screen space without a matrix transform.
//   out4f       - receives a copy of each input vertex (aligned stores)
//   screen4f    - receives each projected vertex (aligned stores)
//   starty/endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f        - source vertices; aligned or unaligned loads are chosen at run time
// While iterating it tracks the component-wise min and max of the inputs and passes
// that bounding box, with an identity matrix, to DPSOFTRAST_Vertex_BoundY, whose
// result is returned.
// NOTE(review): loop headers and pointer increments are on lines not visible here.
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821 float *end = out4f + numitems*4;
1822 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823 __m128 minpos, maxpos;
1824 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
// NOTE(review): this load happens on the visible lines even if numitems is 0
1826 minpos = maxpos = _mm_loadu_ps(in4f);
1829 __m128 v = _mm_loadu_ps(in4f);
1830 minpos = _mm_min_ps(minpos, v);
1831 maxpos = _mm_max_ps(maxpos, v);
1832 _mm_store_ps(out4f, v);
1833 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834 _mm_store_ps(screen4f, v);
// same loop using aligned loads
1842 minpos = maxpos = _mm_load_ps(in4f);
1845 __m128 v = _mm_load_ps(in4f);
1846 minpos = _mm_min_ps(minpos, v);
1847 maxpos = _mm_max_ps(maxpos, v);
1848 _mm_store_ps(out4f, v);
1849 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned temporaries for BoundY's aligned loads
1858 ALIGN(float minposf[4]);
1859 ALIGN(float maxposf[4]);
1860 _mm_store_ps(minposf, minpos);
1861 _mm_store_ps(maxposf, maxpos);
1862 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` float4 vertices by a 4x4 matrix and projects them to screen space.
//   out4f       - receives the transformed vertices (aligned stores)
//   screen4f    - receives the projected vertices (aligned stores)
//   starty/endy - outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f        - source vertices; aligned or unaligned loads are chosen at run time
//   inmatrix16f - 16 floats, loaded unaligned
// A bitwise-identity matrix is delegated to DPSOFTRAST_Vertex_Project. Otherwise the
// bounding box of the untransformed inputs is tracked and handed to
// DPSOFTRAST_Vertex_BoundY together with the matrix; its result is returned.
// NOTE(review): loop headers and pointer increments are on lines not visible here.
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// identity matrix: skip the transform entirely
1872 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874 end = out4f + numitems*4;
1875 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877 m0 = _mm_loadu_ps(inmatrix16f);
1878 m1 = _mm_loadu_ps(inmatrix16f + 4);
1879 m2 = _mm_loadu_ps(inmatrix16f + 8);
1880 m3 = _mm_loadu_ps(inmatrix16f + 12);
1881 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first (untransformed) vertex
1883 minpos = maxpos = _mm_loadu_ps(in4f);
1886 __m128 v = _mm_loadu_ps(in4f);
1887 minpos = _mm_min_ps(minpos, v);
1888 maxpos = _mm_max_ps(maxpos, v);
1889 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890 _mm_store_ps(out4f, v);
1891 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892 _mm_store_ps(screen4f, v);
// same loop using aligned loads
1900 minpos = maxpos = _mm_load_ps(in4f);
1903 __m128 v = _mm_load_ps(in4f);
1904 minpos = _mm_min_ps(minpos, v);
1905 maxpos = _mm_max_ps(maxpos, v);
1906 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907 _mm_store_ps(out4f, v);
1908 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned temporaries for BoundY's aligned loads
1917 ALIGN(float minposf[4]);
1918 ALIGN(float maxposf[4]);
1919 _mm_store_ps(minposf, minpos);
1920 _mm_store_ps(maxposf, maxpos);
1921 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one vertex attribute array into the float4 scratch array
// dpsoftrast.post_array4f[outarray], converting from the client-side format.
//   outarray - index of the destination scratch array
//   inarray  - which attribute to load (position, color, or a texcoord slot)
// The range loaded is dpsoftrast.numvertices items starting at dpsoftrast.firstvertex.
// Colors come from the float pointer, else the byte pointer, else the constant
// dpsoftrast.color. Texcoords dispatch on their component count to the 2f/3f/4f loaders.
// NOTE(review): the switch header, case labels for texcoords, fallbacks for missing
// texcoord pointers and the return statement are on lines not visible in this chunk.
1927 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1930 float *outf = dpsoftrast.post_array4f[outarray];
1931 const unsigned char *inb;
1932 int firstvertex = dpsoftrast.firstvertex;
1933 int numvertices = dpsoftrast.numvertices;
1937 case DPSOFTRAST_ARRAY_POSITION:
1938 stride = dpsoftrast.stride_vertex;
// byte offset of the first vertex to load
1939 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1940 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1942 case DPSOFTRAST_ARRAY_COLOR:
1943 stride = dpsoftrast.stride_color;
1944 if (dpsoftrast.pointer_color4f)
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1947 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1949 else if (dpsoftrast.pointer_color4ub)
1951 stride = dpsoftrast.stride_color;
1952 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1953 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate the constant color
1957 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoord slot = inarray - DPSOFTRAST_ARRAY_TEXCOORD0
1961 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1962 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1964 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1965 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1971 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1974 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a scratch attribute array in place by `inmatrix16f`.
// If inarray >= 0 the attribute is first loaded into post_array4f[outarray];
// otherwise the data already in post_array4f[outarray] is used.
// NOTE(review): the return statement is not visible in this chunk.
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a scratch attribute array to screen space without a matrix transform.
// If inarray >= 0 the attribute is first loaded into post_array4f[outarray].
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy, and the result of Vertex_Project to dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this chunk.
1994 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1997 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1998 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a scratch attribute array in place by `inmatrix16f` and projects it to
// screen space. If inarray >= 0 the attribute is first loaded into post_array4f[outarray].
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty/drawendy, and the result of Vertex_TransformProject to
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this chunk.
2006 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2009 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2010 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel 1/w values for a span, covering x in [span->startx, span->endx).
// w is evaluated from the triangle's plane coefficients:
//   w(x) = triangle->w[2] + span->x*w[0] + span->y*w[1] + w[0]*x
// When the x slope (triangle->w[0]) is zero a flat fast path is used; otherwise
// 1/w is computed exactly at subspan boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels) and linearly interpolated in between.
// NOTE(review): the stores into `zf` and the declarations of x, z and dz are on
// lines not visible in this chunk; `thread` is unused on the visible lines.
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2020 int startx = span->startx;
2021 int endx = span->endx;
2022 float wslope = triangle->w[0];
// w at the span origin (x offset 0)
2023 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// 1/w at the first pixel of the span
2024 float endz = 1.0f / (w + wslope * startx);
2025 if (triangle->w[0] == 0)
2027 // LordHavoc: fast flat polygons (HUD/menu)
2028 for (x = startx;x < endx;x++)
2032 for (x = startx;x < endx;)
2034 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last subspan: stop exactly at the final pixel
2036 if (nextsub >= endx) nextsub = endsub = endx-1;
2037 endz = 1.0f / (w + wslope * nextsub);
// per-pixel step; zero when the subspan has a single pixel (avoids dividing by 0)
2038 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039 for (; x <= endsub; x++, z += dz)
// Commits a shaded span of 4-byte pixels (in4ub) to the colour framebuffer
// dpsoftrast.fb_colorpixels[0], row span->y starting at column span->x.
// - thread->alphatest: pixels whose alpha byte (in4ub[x*4+3]) is below 128 are
//   removed from span->pixelmask.
// - thread->fb_blendmode selects how source and destination are combined.
// The function writes into span->pixelmask, including two sentinel bytes at
// indices endx and endx+1, so the mask must have room for at least endx+2 bytes.
// NOTE(review): the mask is also scanned four bytes at a time through an
// (unsigned int *) cast; that is a type-pun which relies on the compiler
// tolerating the aliasing and on the mask being readable slightly past endx+1.
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2048 int startx = span->startx;
2049 int endx = span->endx;
// the same source and destination memory viewed as whole 32-bit pixels
2051 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052 unsigned char * RESTRICT pixelmask = span->pixelmask;
2053 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views to the first pixel of this span's row segment
2057 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058 pixeli += span->y * dpsoftrast.fb_width + span->x;
2059 // handle alphatest now (this affects depth writes too)
2060 if (thread->alphatest)
2061 for (x = startx;x < endx;x++)
2062 if (in4ub[x*4+3] < 128)
2063 pixelmask[x] = false;
2064 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065 // helps sprites, text and hud artwork
2066 switch(thread->fb_blendmode)
// alpha-weighted modes: a source pixel with alpha 0 is dropped from the mask
2068 case DPSOFTRAST_BLENDMODE_ALPHA:
2069 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071 for (x = startx;x < endx;x++)
2072 if (in4ub[x*4+3] < 1)
2073 pixelmask[x] = false;
// remaining modes: no extra mask culling is visible for these cases
2075 case DPSOFTRAST_BLENDMODE_OPAQUE:
2076 case DPSOFTRAST_BLENDMODE_ADD:
2077 case DPSOFTRAST_BLENDMODE_INVMOD:
2078 case DPSOFTRAST_BLENDMODE_MUL:
2079 case DPSOFTRAST_BLENDMODE_MUL2:
2080 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081 case DPSOFTRAST_BLENDMODE_INVADD:
2084 // put some special values at the end of the mask to ensure the loops end
// a 1 followed by a 0 stops both the "skip masked-off" and the "measure run" scans
2085 pixelmask[endx] = 1;
2086 pixelmask[endx+1] = 0;
2087 // LordHavoc: use a double loop to identify subspans, this helps the
2088 // optimized copy/blend loops to perform at their best, most triangles
2089 // have only one run of pixels, and do the search using wide reads...
2093 // if this pixel is masked off, it's probably not alone...
2100 // the 4-item search must be aligned or else it stalls badly
// step byte-wise until x is a multiple of 4 (or an enabled pixel is found)
2101 if ((x & 3) && !pixelmask[x])
2104 if ((x & 3) && !pixelmask[x])
2107 if ((x & 3) && !pixelmask[x]) x++;
// four mask bytes at once: all-zero means four masked-off pixels
2110 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2114 for (;!pixelmask[x];x++)
2116 // rather than continue the loop, just check the end variable
2120 // find length of subspan
// same alignment trick for the run of enabled pixels, tracked in subx
2125 if ((subx & 3) && pixelmask[subx])
2128 if ((subx & 3) && pixelmask[subx])
2131 if ((subx & 3) && pixelmask[subx]) subx++;
// four mask bytes at once: 0x01010101 means four enabled pixels
2134 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2138 for (;pixelmask[subx];subx++)
2140 // the checks can overshoot, so make sure to clip it...
2143 // now that we know the subspan length... process!
2144 switch(thread->fb_blendmode)
// OPAQUE: straight copy of the run [x, subx) from the source to the framebuffer
2146 case DPSOFTRAST_BLENDMODE_OPAQUE:
2150 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// unaligned 128-bit copies: 16 pixels per iteration, then 4 pixels per iteration
2155 while (x + 16 <= subx)
2157 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2158 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2159 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2160 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2165 while (x + 4 <= subx)
2167 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2173 pixeli[x+1] = ini[x+1];
// ALPHA: dst += (src - dst) * srcalpha / 256; both operands are shifted left by 4
// so that the high 16 bits of the product (_mm_mulhi_epi16) carry the result.
// FINISHBLEND(blend2, blend1) widens bytes to 16-bit lanes and applies blend2 to
// pairs of pixels and blend1 to a single leftover pixel, then packs with saturation.
2183 case DPSOFTRAST_BLENDMODE_ALPHA:
2184 #define FINISHBLEND(blend2, blend1) \
2185 for (;x + 1 < subx;x += 2) \
2188 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2189 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2191 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2196 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2197 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2199 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2203 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2204 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2206 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2207 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8; "blend" is lane 3 (alpha) broadcast to every lane
2210 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2212 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2213 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2215 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2216 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (saturated to 255 by the pack step in FINISHBLEND)
2219 case DPSOFTRAST_BLENDMODE_ADD:
2220 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2222 case DPSOFTRAST_BLENDMODE_INVMOD:
2224 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2226 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2229 case DPSOFTRAST_BLENDMODE_MUL:
2230 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2232 case DPSOFTRAST_BLENDMODE_MUL2:
2233 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8
2235 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2237 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2240 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2244 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2246 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2249 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += ((255 - dst) * src) / 256, using the same shift-by-4 / mulhi trick as ALPHA
2253 case DPSOFTRAST_BLENDMODE_INVADD:
2255 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2257 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to unit texunitindex across one span and writes
// the result as 4 floats per pixel into out4f (index x*4 .. x*4+3).
// Texture coordinates come from the interpolated attribute arrayindex, are
// multiplied by the per-pixel factor zf[x] and by the mip level's dimensions.
// Texel bytes are read in the order [2],[1],[0],[3] and scaled to 0..1.
// Four sampling paths exist: bilinear or nearest, each with clamp-to-edge
// (DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE) or wrap-by-mask addressing.
// NOTE(review): the wrap paths AND coordinates with (size-1), which only wraps
// correctly when the mip dimensions are powers of two - confirm that invariant.
2265 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2268 int startx = span->startx;
2269 int endx = span->endx;
2274 float tc[2], endtc[2];
2276 unsigned int tci[2];
2277 unsigned int tci1[2];
2278 unsigned int tcimin[2];
2279 unsigned int tcimax[2];
2284 const unsigned char * RESTRICT pixelbase;
2285 const unsigned char * RESTRICT pixel[4];
2286 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2287 // if no texture is bound, just fill it with white
2290 for (x = startx;x < endx;x++)
2292 out4f[x*4+0] = 1.0f;
2293 out4f[x*4+1] = 1.0f;
2294 out4f[x*4+2] = 1.0f;
2295 out4f[x*4+3] = 1.0f;
// mip level chosen for this triangle and texture unit; mipmap[mip][0] is the byte offset of that level
2299 mip = triangle->mip[texunitindex];
2300 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2301 // if this mipmap of the texture is 1 pixel, just fill it with that color
// mipmap[mip][1] == 4 is the test used for "one 4-byte texel"
// NOTE(review): the colour is read from texture->bytes (offset 0), not from pixelbase - confirm this is intended.
2302 if (texture->mipmap[mip][1] == 4)
2304 c[0] = texture->bytes[2] * (1.0f/255.0f);
2305 c[1] = texture->bytes[1] * (1.0f/255.0f);
2306 c[2] = texture->bytes[0] * (1.0f/255.0f);
2307 c[3] = texture->bytes[3] * (1.0f/255.0f);
2308 for (x = startx;x < endx;x++)
2310 out4f[x*4+0] = c[0];
2311 out4f[x*4+1] = c[1];
2312 out4f[x*4+2] = c[2];
2313 out4f[x*4+3] = c[3];
2317 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2318 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2319 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height
2320 tcscale[0] = texture->mipmap[mip][2];
2321 tcscale[1] = texture->mipmap[mip][3];
2322 tciwidth = texture->mipmap[mip][2];
// NOTE(review): tcimin is compared against below but its initialisation is not visible here - confirm it is set.
2325 tcimax[0] = texture->mipmap[mip][2]-1;
2326 tcimax[1] = texture->mipmap[mip][3]-1;
2327 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2328 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// perspective-correct coordinates at the first pixel of the span, in texel units
2329 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2330 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in chunks; coordinates are exact at chunk ends and stepped linearly in between
2336 for (x = startx;x < endx;)
2338 unsigned int subtc[2];
2339 unsigned int substep[2];
// 4096 == 1<<12: coordinates inside a chunk are 20.12 fixed point
2340 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2341 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2342 if (nextsub >= endx)
2344 nextsub = endsub = endx-1;
2345 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2349 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2350 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// NOTE(review): these are float -> unsigned int conversions; a negative step or
// coordinate is undefined behaviour in C - consider converting via int first.
2356 substep[0] = (endtc[0] - tc[0]) * subscale;
2357 substep[1] = (endtc[1] - tc[1]) * subscale;
2358 subtc[0] = tc[0] * (1<<12);
2359 subtc[1] = tc[1] * (1<<12);
// bilinear + clamp-to-edge
2362 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2364 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the fractional position between texels; ifrac is its complement
2366 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2367 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four bilinear weights; they always sum to 0x1000*0x1000 = 0x1000000
2368 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2369 tci[0] = subtc[0]>>12;
2370 tci[1] = subtc[1]>>12;
2371 tci1[0] = tci[0] + 1;
2372 tci1[1] = tci[1] + 1;
// clamp both the base texel and its +1 neighbour into [tcimin, tcimax]
2373 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2374 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2375 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2376 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// addresses of the 2x2 texel footprint, 4 bytes per texel, tciwidth texels per row
2377 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2378 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2379 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2380 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 == 255 * 0x1000000: removes the weight scale and maps bytes to 0..1 in one multiply
2381 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2382 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2383 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2384 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2385 out4f[x*4+0] = c[0];
2386 out4f[x*4+1] = c[1];
2387 out4f[x*4+2] = c[2];
2388 out4f[x*4+3] = c[3];
// bilinear + wrap: identical to the loop above except coordinates are ANDed with the wrap mask
2393 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2395 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2396 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2397 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2398 tci[0] = subtc[0]>>12;
2399 tci[1] = subtc[1]>>12;
2400 tci1[0] = tci[0] + 1;
2401 tci1[1] = tci[1] + 1;
2402 tci[0] &= tciwrapmask[0];
2403 tci[1] &= tciwrapmask[1];
2404 tci1[0] &= tciwrapmask[0];
2405 tci1[1] &= tciwrapmask[1];
2406 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2408 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2409 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2410 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2411 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2412 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2413 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2414 out4f[x*4+0] = c[0];
2415 out4f[x*4+1] = c[1];
2416 out4f[x*4+2] = c[2];
2417 out4f[x*4+3] = c[3];
// nearest + clamp-to-edge: a single texel fetch per pixel
2421 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2423 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2425 tci[0] = subtc[0]>>12;
2426 tci[1] = subtc[1]>>12;
2427 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2428 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2429 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2430 c[0] = pixel[0][2] * (1.0f / 255.0f);
2431 c[1] = pixel[0][1] * (1.0f / 255.0f);
2432 c[2] = pixel[0][0] * (1.0f / 255.0f);
2433 c[3] = pixel[0][3] * (1.0f / 255.0f);
2434 out4f[x*4+0] = c[0];
2435 out4f[x*4+1] = c[1];
2436 out4f[x*4+2] = c[2];
2437 out4f[x*4+3] = c[3];
// nearest + wrap
2442 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2444 tci[0] = subtc[0]>>12;
2445 tci[1] = subtc[1]>>12;
2446 tci[0] &= tciwrapmask[0];
2447 tci[1] &= tciwrapmask[1];
2448 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2449 c[0] = pixel[0][2] * (1.0f / 255.0f);
2450 c[1] = pixel[0][1] * (1.0f / 255.0f);
2451 c[2] = pixel[0][0] * (1.0f / 255.0f);
2452 c[3] = pixel[0][3] * (1.0f / 255.0f);
2453 out4f[x*4+0] = c[0];
2454 out4f[x*4+1] = c[1];
2455 out4f[x*4+2] = c[2];
2456 out4f[x*4+3] = c[3];
// SSE2 variant of 2D span texturing: samples the texture bound to unit
// texunitindex and writes one 32-bit pixel per x into out4ub (viewed as outi).
// Texture coordinates are kept as 16.16 fixed point in 32-bit lanes (scale
// 65536); pixels are produced two at a time with a one-pixel tail.
// Paths: bilinear with an in-range fast path, bilinear clamp-to-edge, bilinear
// wrap, nearest clamp-to-edge and nearest wrap.
// NOTE(review): texel and mask reads go through (const int *) / (const unsigned
// int *) casts of byte pointers, which relies on the compiler tolerating the
// type-pun and on unaligned 32-bit loads being acceptable on the target.
2462 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2466 int startx = span->startx;
2467 int endx = span->endx;
2469 __m128 data, slope, tcscale;
2470 __m128i tcsize, tcmask, tcoffset, tcmax;
2472 __m128i subtc, substep, endsubtc;
2475 int affine; // LordHavoc: optimized affine texturing case
2476 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2477 const unsigned char * RESTRICT pixelbase;
2478 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2479 // if no texture is bound, just fill it with white
2482 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2485 mip = triangle->mip[texunitindex];
2486 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2487 // if this mipmap of the texture is 1 pixel, just fill it with that color
2488 if (texture->mipmap[mip][1] == 4)
// the single texel is read as one 32-bit value and replicated across the span
2490 unsigned int k = *((const unsigned int *)pixelbase);
2491 for (x = startx;x < endx;x++)
// equal depth factors at both ends of the span: treat the whole span as one linear chunk
2495 affine = zf[startx] == zf[endx-1];
2496 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2497 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2498 flags = texture->flags;
// loads mipmap[mip][0..3] and keeps elements 2 and 3 (the level's two dimensions) in every lane pair
2499 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2500 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2501 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two coordinate components into both halves and pre-scale them to texel units
2502 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2503 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2504 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift of the sampling position (NOTE(review): the condition guarding this line is not visible here)
2506 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2507 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// every 32-bit lane: low 16 bits = 4 (bytes per texel), high 16 bits = size<<2 (bytes per row);
// _mm_madd_epi16 against a 16-bit coordinate pair then yields a byte offset into the level
2508 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 narrowed to 16-bit lanes: used as clamp maximum and as wrap mask
2509 tcmax = _mm_packs_epi32(tcmask, tcmask);
2510 for (x = startx;x < endx;)
2512 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2513 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2514 if (nextsub >= endx || affine)
2516 nextsub = endsub = endx-1;
2517 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2521 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2523 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2524 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2525 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half = coordinates of pixel x, high half = coordinates of pixel x+1; the step is doubled to match
2526 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2527 substep = _mm_slli_epi32(substep, 1);
// integer texel coordinates at the chunk start and just past its end
2530 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
// fast path when every one of those coordinates satisfies 0 <= c < size-1: no clamp or wrap needed
2531 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row stride in bytes, taken from the high 16 bits of tcoffset
2533 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2534 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2536 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2537 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2538 tci = _mm_madd_epi16(tci, tcoffset);
2539 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2540 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; ptr + stride is the row below
2541 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2542 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2543 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2544 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractional bits halved so they stay non-negative for the signed _mm_mulhi_epi16
2545 fracm = _mm_srli_epi16(subtc, 1);
2546 pix1 = _mm_add_epi16(pix1,
2547 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2548 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2549 pix3 = _mm_add_epi16(pix3,
2550 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2551 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2552 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2553 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2554 pix2 = _mm_add_epi16(pix2,
2555 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2556 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2557 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the in-range fast path
2561 const unsigned char * RESTRICT ptr1;
2562 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2563 tci = _mm_madd_epi16(tci, tcoffset);
2564 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2565 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2566 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2567 fracm = _mm_srli_epi16(subtc, 1);
2568 pix1 = _mm_add_epi16(pix1,
2569 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2570 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2571 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2572 pix1 = _mm_add_epi16(pix1,
2573 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2574 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2575 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear + clamp-to-edge: the four footprint coordinates are built by adding
// (0,0),(1,0),(0,1),(1,1) as packed 16-bit pairs, then clamped to [0, size-1]
2579 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2581 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2583 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2584 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2585 tci = _mm_madd_epi16(tci, tcoffset);
2586 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2587 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2588 _mm_setzero_si128());
2589 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2590 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2591 _mm_setzero_si128());
2592 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): this is the clamp-to-edge path, yet the second pixel's footprint is
// wrapped with _mm_and_si128(..., tcmax) while the first pixel above and the
// single-pixel tail below clamp with _mm_min_epi16/_mm_max_epi16. Looks like a
// copy/paste slip from the wrap path; confirm and switch to the clamp form.
2593 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2594 tci = _mm_madd_epi16(tci, tcoffset);
2595 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2596 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2597 _mm_setzero_si128());
2598 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2599 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2600 _mm_setzero_si128());
2601 fracm = _mm_srli_epi16(subtc, 1);
2602 pix1 = _mm_add_epi16(pix1,
2603 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2605 pix3 = _mm_add_epi16(pix3,
2606 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2607 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2608 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2609 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2610 pix2 = _mm_add_epi16(pix2,
2611 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2612 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2613 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the bilinear clamp-to-edge path
2617 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2618 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2619 tci = _mm_madd_epi16(tci, tcoffset);
2620 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2621 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2622 _mm_setzero_si128());
2623 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2624 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2625 _mm_setzero_si128());
2626 fracm = _mm_srli_epi16(subtc, 1);
2627 pix1 = _mm_add_epi16(pix1,
2628 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2629 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2630 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2631 pix1 = _mm_add_epi16(pix1,
2632 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2633 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2634 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear + wrap: footprint coordinates are ANDed with size-1 instead of clamped
2640 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2642 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2643 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2644 tci = _mm_madd_epi16(tci, tcoffset);
2645 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2646 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2647 _mm_setzero_si128());
2648 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2649 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2650 _mm_setzero_si128());
2651 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2652 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2653 tci = _mm_madd_epi16(tci, tcoffset);
2654 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2655 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2656 _mm_setzero_si128());
2657 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2658 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2659 _mm_setzero_si128());
2660 fracm = _mm_srli_epi16(subtc, 1);
2661 pix1 = _mm_add_epi16(pix1,
2662 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2664 pix3 = _mm_add_epi16(pix3,
2665 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2666 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2667 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2668 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2669 pix2 = _mm_add_epi16(pix2,
2670 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2671 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2672 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel tail of the bilinear wrap path
2676 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2677 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678 tci = _mm_madd_epi16(tci, tcoffset);
2679 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681 _mm_setzero_si128());
2682 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684 _mm_setzero_si128());
2685 fracm = _mm_srli_epi16(subtc, 1);
2686 pix1 = _mm_add_epi16(pix1,
2687 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2690 pix1 = _mm_add_epi16(pix1,
2691 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2692 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2693 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// nearest + clamp-to-edge: one 32-bit texel copied per pixel
2700 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2702 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2704 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2705 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2706 tci = _mm_madd_epi16(tci, tcoffset);
2707 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2708 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel tail of the nearest clamp-to-edge path
2712 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2713 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2714 tci = _mm_madd_epi16(tci, tcoffset);
2715 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// nearest + wrap
2721 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2723 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2724 tci = _mm_and_si128(tci, tcmax);
2725 tci = _mm_madd_epi16(tci, tcoffset);
2726 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2727 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel tail of the nearest wrap path
2731 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2732 tci = _mm_and_si128(tci, tcmax);
2733 tci = _mm_madd_epi16(tci, tcoffset);
2734 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2743 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2746 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow-map sampling entry point: takes a pointer to float components and
// returns a single float.
// NOTE(review): what `vector` must contain and what the returned value encodes
// are not evident at this point - confirm before relying on either.
2749 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float span by an interpolated vertex attribute:
// out4f[x] = in4f[x] * ((data + slope*x) * z), per channel, for x in
// [span->startx, span->endx). data and slope come from
// DPSOFTRAST_CALCATTRIB4F for attribute array arrayindex.
// NOTE(review): the assignment of z is not visible here; presumably it is the
// per-pixel factor zf[x] - confirm.
2755 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2758 int startx = span->startx;
2759 int endx = span->endx;
2764 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2765 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2768 c[0] = (data[0] + slope[0]*x) * z;
2769 c[1] = (data[1] + slope[1]*x) * z;
2770 c[2] = (data[2] + slope[2]*x) * z;
2771 c[3] = (data[3] + slope[3]*x) * z;
2772 out4f[x*4+0] = in4f[x*4+0] * c[0];
2773 out4f[x*4+1] = in4f[x*4+1] * c[1];
2774 out4f[x*4+2] = in4f[x*4+2] * c[2];
2775 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated vertex attribute into a float span:
// out4f[x] = (data + slope*x) * z, per channel, for x in
// [span->startx, span->endx). data and slope come from
// DPSOFTRAST_CALCATTRIB4F for attribute array arrayindex.
// NOTE(review): the assignment of z is not visible here; presumably it is the
// per-pixel factor zf[x] - confirm.
2779 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2782 int startx = span->startx;
2783 int endx = span->endx;
2788 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2789 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2792 c[0] = (data[0] + slope[0]*x) * z;
2793 c[1] = (data[1] + slope[1]*x) * z;
2794 c[2] = (data[2] + slope[2]*x) * z;
2795 c[3] = (data[3] + slope[3]*x) * z;
2796 out4f[x*4+0] = c[0];
2797 out4f[x*4+1] = c[1];
2798 out4f[x*4+2] = c[2];
2799 out4f[x*4+3] = c[3];
2803 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2805 int x, startx = span->startx, endx = span->endx;
2806 float c[4], localcolor[4];
2807 localcolor[0] = subcolor[0];
2808 localcolor[1] = subcolor[1];
2809 localcolor[2] = subcolor[2];
2810 localcolor[3] = subcolor[3];
2811 for (x = startx;x < endx;x++)
2813 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2814 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2815 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2816 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2817 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2818 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2819 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2820 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2824 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2826 int x, startx = span->startx, endx = span->endx;
2827 for (x = startx;x < endx;x++)
2829 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2830 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2831 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2832 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2836 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2838 int x, startx = span->startx, endx = span->endx;
2839 for (x = startx;x < endx;x++)
2841 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2842 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2843 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2844 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float spans per pixel: out4f = ina4f * a + inb4f * b for all four
// channels, where a = 1 - (alpha of inb4f at that pixel), for x in
// [span->startx, span->endx). triangle is not used.
// NOTE(review): the assignment of b is not visible here; presumably it is the
// alpha of inb4f (inb4f[x*4+3]), making this a standard alpha mix - confirm.
2848 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2850 int x, startx = span->startx, endx = span->endx;
2852 for (x = startx;x < endx;x++)
// weight of the first input: one minus the second input's alpha channel
2854 a = 1.0f - inb4f[x*4+3];
2856 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2857 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2858 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2859 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float span buffer toward one constant colour, weighted by color[3]:
// out = in * (1 - color[3]) + color * color[3] per component, for every x in
// [span->startx, span->endx). color points at four floats.
2863 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2865 int x, startx = span->startx, endx = span->endx;
2866 float localcolor[4], ilerp, lerp;
// copy the colour into locals once, before the loop
2867 localcolor[0] = color[0];
2868 localcolor[1] = color[1];
2869 localcolor[2] = color[2];
2870 localcolor[3] = color[3];
// ilerp weights the input buffer, lerp weights the uniform colour
2871 ilerp = 1.0f - localcolor[3];
2872 lerp = localcolor[3];
2873 for (x = startx;x < endx;x++)
2875 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2876 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2877 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
// the fourth component is blended like the others: in*(1-a) + a*a with a = color[3]
2878 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies each 8-bit pixel of in4ub by an interpolated vertex attribute and
// writes the result to out4ub, for x in [span->startx, span->endx).
// The attribute (selected by arrayindex) is evaluated as (data + slope*x) * zf[x]
// at sub-span end points only and stepped linearly in between, using 16-bit
// fixed point with a scale of 256.
// zf is indexed per pixel - presumably the reciprocal-w buffer filled by
// DPSOFTRAST_Draw_Span_Begin; TODO confirm.
2884 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2888 int startx = span->startx;
2889 int endx = span->endx;
2892 __m128i submod, substep, endsubmod;
2893 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps elements 0 and 2 and keeps 1 and 3
// (presumably RGBA attribute order -> BGRA pixel order).
2894 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2895 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x256)
2896 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2897 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2898 for (x = startx; x < endx;)
// each outer iteration covers one sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2900 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// subscale converts the float delta across the sub-span into a per-pixel fixed-point step
2901 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2902 if (nextsub >= endx)
// last (short) sub-span: end on the final pixel and rescale the step to its length
2904 nextsub = endsub = endx-1;
2905 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute value at the end of this sub-span
2909 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2910 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2911 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low four words = modulator for pixel x, high four words = modulator for pixel x+1
2912 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2913 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): substep holds a one-pixel step in both halves, yet it is added only
// once per two-pixel iteration - confirm whether the step should be doubled here.
2914 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are unpacked into the HIGH byte of each word (value*256), so mulhi with the
// x256 modulator yields pixel*modulator; packus clamps the result to 0..255
2916 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2917 pix = _mm_mulhi_epu16(pix, submod);
2918 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel form of the same multiply (32-bit load/store instead of 64-bit)
2922 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2923 pix = _mm_mulhi_epu16(pix, submod);
2924 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes an interpolated vertex attribute straight into out4ub as 8-bit pixels,
// for x in [span->startx, span->endx).
// The attribute (selected by arrayindex) is evaluated as (data + slope*x) * zf[x]
// at sub-span end points and stepped linearly in between, in fixed point with a
// scale of 4095; shifting right by 4 then brings it to the 0..255 range.
2931 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2935 int startx = span->startx;
2936 int endx = span->endx;
2939 __m128i submod, substep, endsubmod;
2940 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) swaps elements 0 and 2 and keeps 1 and 3
// (presumably RGBA attribute order -> BGRA pixel order).
2941 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2942 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as 32-bit fixed point (x4095)
2943 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2944 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2945 for (x = startx; x < endx;)
// each outer iteration covers one sub-span of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2947 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2948 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2949 if (nextsub >= endx)
// last (short) sub-span: end on the final pixel and rescale the step to its length
2951 nextsub = endsub = endx-1;
2952 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute value at the end of this sub-span
2956 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2957 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2958 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low four words = value for pixel x, high four words = value for pixel x+1
2959 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2960 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): substep holds a one-pixel step in both halves, yet it is added only
// once per two-pixel iteration - confirm whether the step should be doubled here.
2961 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// 12-bit value >> 4 gives 8 bits; packus clamps to 0..255 and two pixels are stored
2963 __m128i pix = _mm_srai_epi16(submod, 4);
2964 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel form of the same conversion (32-bit store)
2968 __m128i pix = _mm_srai_epi16(submod, 4);
2969 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Adds a thresholded second buffer onto the first, per 8-bit channel:
// out = clamp255(ina + max(inb - subcolor*255, 0)).
// subcolor points at four floats; its elements 0 and 2 are swapped before use
// (presumably RGBA -> BGRA, matching the pixel byte order).
2976 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2979 int x, startx = span->startx, endx = span->endx;
// threshold as 16-bit words, duplicated in both register halves so it applies to two pixels
2980 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2981 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
2982 for (x = startx;x+2 <= endx;x+=2)
// bytes are zero-extended to words; subs_epu16 is a saturating subtract, so it floors at 0
2984 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2985 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2986 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2987 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
2991 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2992 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2993 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2994 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Multiplies two 8-bit-per-channel span buffers: out = (ina * inb) / 256 per channel,
// for x in [span->startx, span->endx), two pixels per iteration.
2999 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3002 int x, startx = span->startx, endx = span->endx;
3003 for (x = startx;x+2 <= endx;x+=2)
// pix1 is zero-extended (value), pix2 is placed in the high byte (value*256);
// mulhi keeps the top 16 bits of the product, i.e. ina*inb*256 >> 16
3005 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3006 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3007 pix1 = _mm_mulhi_epu16(pix1, pix2);
3008 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
3012 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3013 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3014 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds two 8-bit-per-channel span buffers with saturation: out = min(ina + inb, 255)
// per channel, for x in [span->startx, span->endx), two pixels per iteration.
3020 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3023 int x, startx = span->startx, endx = span->endx;
3024 for (x = startx;x+2 <= endx;x+=2)
// widen to 16 bits so the sum cannot wrap; packus clamps back to 0..255
3026 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3027 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3028 pix1 = _mm_add_epi16(pix1, pix2);
3029 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
3033 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3034 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3035 pix1 = _mm_add_epi16(pix1, pix2);
3036 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Adds a tinted second buffer onto the first: out = clamp255(ina + inb * tint) per
// channel, for x in [span->startx, span->endx), two pixels per iteration.
// inbtintbgra points at four floats and is used in the order given (no channel
// swap is applied here, unlike the other colour uniforms in this file).
3041 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3044 int x, startx = span->startx, endx = span->endx;
// tint as fixed point with a scale of 256, duplicated in both register halves
3045 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3046 tint = _mm_packs_epi32(tint, tint);
3047 for (x = startx;x+2 <= endx;x+=2)
// pix2 sits in the high byte (value*256) so mulhi(tint, pix2) equals inb*tint
3049 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3051 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3052 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3058 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends inb over ina using inb's own fourth channel as the weight:
// out = ina + (inb - ina) * inb.a / 256 per channel, for x in
// [span->startx, span->endx), two pixels per iteration.
3064 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3067 int x, startx = span->startx, endx = span->endx;
3068 for (x = startx;x+2 <= endx;x+=2)
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast word 3 (the fourth channel) of each pixel across that pixel's four words
3072 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-half multiply returns
// (difference * blend) >> 8
3073 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3074 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
3078 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3079 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3080 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3081 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3082 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// Blends an 8-bit-per-channel span buffer toward one constant colour, weighted by
// the colour's own fourth component:
// out = in + (color*255 - in) * (color[3]*255) / 256 per channel, for x in
// [span->startx, span->endx), two pixels per iteration.
// color points at four floats; elements 0 and 2 are swapped before use
// (presumably RGBA -> BGRA, matching the pixel byte order).
3087 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3090 int x, startx = span->startx, endx = span->endx;
3091 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3092 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend = word 3 of the colour broadcast to every word, pre-shifted left by 4 (see below)
3093 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3094 for (x = startx;x+2 <= endx;x+=2)
3096 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is also shifted left by 4, so the signed high-half multiply
// returns (difference * blend) >> 8
3097 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3098 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel form of the same operation (presumably for an odd trailing pixel)
3102 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3103 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3104 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform and
// passes the colour and first texcoord arrays through DPSOFTRAST_Array_Load.
// Reads the global dpsoftrast state rather than taking parameters.
3111 void DPSOFTRAST_VertexShader_Generic(void)
3113 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3114 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3115 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the second texcoord array is only loaded when the SPECULAR permutation bit is set
3116 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3117 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader for one span. Builds the fragment colour in
// buffer_FragColorbgra8 from the first texture modulated by the interpolated
// attribute 1, optionally combines a second texture, and hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
// The literal array indices 1 and 2 are presumably DPSOFTRAST_ARRAY_COLOR and
// DPSOFTRAST_ARRAY_TEXCOORD0 - TODO confirm against the enum.
3120 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3122 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3123 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3124 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3125 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3126 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3127 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// FragColor = first texture * interpolated attribute 1
3129 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3130 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3131 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// the second texture is sampled with the same array index (2) as the first
3133 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3134 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3137 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): this repeats the SHADERPERMUTATION_COLORMAPPING test of the preceding
// if, so the additive branch below looks unreachable as written - a different
// permutation flag was presumably intended; confirm.
3139 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3142 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3144 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3147 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// untextured path: FragColor is just the interpolated attribute 1
// (presumably the else-branch of the DIFFUSE test - confirm)
3152 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3153 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: runs the position array through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform and
// loads two texcoord arrays.
3158 void DPSOFTRAST_VertexShader_PostProcess(void)
3160 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3161 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
// the two array arguments differ here (TEXCOORD1 and TEXCOORD4), unlike the other
// Array_Load calls in this file; which one is source and which is destination
// depends on DPSOFTRAST_Array_Load's parameter order - TODO confirm
3162 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader for one span: samples the first texture
// into buffer_FragColorbgra8, optionally adds the thresholded second (bloom)
// texture, blends toward the ViewTintColor uniform and hands the result to
// DPSOFTRAST_Draw_Span_FinishBGRA8. Saturation and gamma ramps are not implemented.
3165 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3167 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3168 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3169 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3171 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture sampled with array index 2 directly into the output buffer
3172 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3173 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture sampled with array index 3, then added after subtracting the
// BloomColorSubtract uniform
3175 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3176 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// blend toward the view tint, weighted by the tint's own fourth component
3178 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3179 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3181 // TODO: implement saturation
3183 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3185 // TODO: implement gammaramps
3187 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only the position array is processed,
// through DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform.
3192 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3194 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: fills the span's pixels with zero bytes
// and passes them to DPSOFTRAST_Draw_Span_FinishBGRA8.
3197 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3199 // this is never called (because colormask is off when this shader is used)
3200 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3201 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, covering only [span->startx, span->endx)
3203 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3204 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-colour shader: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform, and
// the first texcoord array goes through DPSOFTRAST_Array_Transform with the
// TexMatrixM1 uniform.
3209 void DPSOFTRAST_VertexShader_FlatColor(void)
3211 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3212 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-colour shader for one span: output = colour texture *
// Color_Ambient (with the fourth channel taken from the Alpha uniform instead).
// Pixels are written straight into the colour framebuffer unless alpha test or a
// non-opaque blend mode is active, in which case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 completes the write.
3215 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3218 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at (span->x, span->y), four bytes per pixel
3219 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3220 int x, startx = span->startx, endx = span->endx;
3221 __m128i Color_Ambientm;
3222 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3223 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3226 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3227 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3228 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256) with elements 0 and 2 swapped; _mm_load_ps is
// an aligned load, so this assumes uniform4f is 16-byte aligned
3229 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// clear lane 3 and replace it with the Alpha uniform scaled by 255
3230 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3231 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
// pack to 16-bit words, duplicated in both halves so it applies to two pixels
3232 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3233 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and their four mask bytes are all 0x01
3236 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texels go into the high byte of each word, so mulhi gives colour*texel
3239 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3240 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3241 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3242 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3248 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3249 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3250 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered path needs the finishing step
3252 if (pixel == buffer_FragColorbgra8)
3253 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-colour shader: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform, the
// colour array is loaded unchanged, and the first texcoord array goes through
// DPSOFTRAST_Array_Transform with the TexMatrixM1 uniform.
3259 void DPSOFTRAST_VertexShader_VertexColor(void)
3261 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3262 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3263 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-colour shader for one span:
// output = colour texture * (Color_Diffuse * interpolated vertex colour + Color_Ambient),
// with the fourth channel of the ambient term taken from the Alpha uniform.
// Pixels are written straight into the colour framebuffer unless alpha test or a
// non-opaque blend mode is active, in which case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 completes the write.
3266 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3269 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at (span->x, span->y), four bytes per pixel
3270 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3271 int x, startx = span->startx, endx = span->endx;
3272 __m128i Color_Ambientm, Color_Diffusem;
3274 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3278 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3279 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3280 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3281 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256), elements 0 and 2 swapped, lane 3 replaced by
// the Alpha uniform scaled by 255, then packed to 16-bit words in both halves
3282 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3283 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3284 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3285 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour uses a scale of 4096 and has lane 3 cleared; combined with the x4096
// vertex colour below, mulhi (>> 16) leaves the product at a scale of 256
3286 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3287 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3288 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolants of the vertex colour: swap elements 0 and 2, advance to startx and
// pre-scale value and slope by 4096
3289 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3290 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3291 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3292 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3293 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3294 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data advances by one pixel per loop iteration
3295 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3297 __m128i color, mod, pix;
// fast path: at least four pixels remain and their four mask bytes are all 0x01
3298 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four per-pixel z values are loaded at once; each pixel's colour is data * z for
// that pixel, and data is stepped three times here (the loop header adds the fourth)
3301 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3302 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3303 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3304 data = _mm_add_ps(data, slope);
3305 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3306 data = _mm_add_ps(data, slope);
3307 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3308 data = _mm_add_ps(data, slope);
3309 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex colour + ambient) * texel, pixels 0-1 in pix and 2-3 in pix2
3310 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3311 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3312 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3313 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3314 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: same formula with one z value
3320 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3321 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3322 mod = _mm_packs_epi32(mod, mod);
3323 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3324 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered path needs the finishing step
3326 if (pixel == buffer_FragColorbgra8)
3327 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: positions go through
// DPSOFTRAST_Array_TransformProject with the ModelViewProjectionMatrixM1 uniform, the
// first texcoord array goes through DPSOFTRAST_Array_Transform with the TexMatrixM1
// uniform, and the TEXCOORD4 array (used for the lightmap lookup in the pixel stage)
// is loaded unchanged.
3333 void DPSOFTRAST_VertexShader_Lightmap(void)
3335 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3336 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3337 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader for one span:
// output = colour texture * (Color_Diffuse * lightmap + Color_Ambient)
//          [+ Color_Glow * glow texture when the GLOW permutation bit is set],
// with the fourth channel of the ambient term taken from the Alpha uniform.
// Pixels are written straight into the colour framebuffer unless alpha test or a
// non-opaque blend mode is active, in which case they go to buffer_FragColorbgra8
// and DPSOFTRAST_Draw_Span_FinishBGRA8 completes the write.
3340 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3343 unsigned char * RESTRICT pixelmask = span->pixelmask;
// default destination: framebuffer 0 at (span->x, span->y), four bytes per pixel
3344 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3345 int x, startx = span->startx, endx = span->endx;
3346 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3347 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3348 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3349 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3350 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3351 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3352 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// colour texture uses TEXCOORD0, lightmap uses TEXCOORD4
3353 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3354 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3355 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3356 pixel = buffer_FragColorbgra8;
// ambient colour as fixed point (x256), elements 0 and 2 swapped, lane 3 replaced by
// the Alpha uniform scaled by 255, then packed to 16-bit words in both halves
3357 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3358 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3359 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3360 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse colour as fixed point (x256) with lane 3 cleared
3361 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3362 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3363 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3364 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the colour texture; glow colour is prepared
// like the diffuse colour (x256, lane 3 cleared)
3366 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3367 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3368 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3369 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow colour (used by the single-pixel path)
3370 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3371 for (x = startx;x < endx;x++)
3373 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and their four mask bytes are all 0x01
3374 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3377 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3378 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3379 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * colour + glowcolour * glow; pixels 0-1 in pix, 2-3 in pix2
3380 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3381 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3382 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3383 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3384 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3385 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3386 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the upper four words of lightmap are zero, so one multiply
// yields (diffuse*lightmap + ambient)*colour in the low half and glowcolour*glow in
// the high half; the two halves are then summed
3392 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3393 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3394 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3395 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3396 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3397 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same loop without the glow term
// NOTE(review): presumably the else-branch of the GLOW test above - confirm
3402 for (x = startx;x < endx;x++)
3404 __m128i color, lightmap, pix;
3405 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3408 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3409 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3411 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3412 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3413 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3414 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3420 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3421 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3422 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3423 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered path needs the finishing step
3426 if (pixel == buffer_FragColorbgra8)
3427 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders that follow
// delegate to these two functions, which are defined later in the file.
3432 void DPSOFTRAST_VertexShader_LightDirection(void);
3433 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the FakeLight shader: delegates entirely to the LightDirection
// vertex shader.
3435 void DPSOFTRAST_VertexShader_FakeLight(void)
3437 DPSOFTRAST_VertexShader_LightDirection();
// Pixel stage of the FakeLight shader: forwards its arguments unchanged to the
// LightDirection pixel shader.
3440 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3442 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage of the model-space light-direction-map shader: runs the
// LightDirection vertex shader, then additionally loads the TEXCOORD4 array.
3447 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3449 DPSOFTRAST_VertexShader_LightDirection();
3450 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the model-space light-direction-map shader: forwards its arguments
// unchanged to the LightDirection pixel shader.
3453 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex stage of the tangent-space light-direction-map shader: runs the
// LightDirection vertex shader, then additionally loads the TEXCOORD4 array
// (the same two calls as the model-space variant).
3460 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3462 DPSOFTRAST_VertexShader_LightDirection();
3463 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the tangent-space light-direction-map shader: forwards its
// arguments unchanged to the LightDirection pixel shader.
3466 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3468 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex stage for the directional-light shader modes.
// For every vertex it expresses the LightDir uniform and the vertex-to-eye
// vector in the per-vertex basis formed by TEXCOORD1 (svector), TEXCOORD2
// (tvector) and TEXCOORD3 (normal), writing the results to TEXCOORD5 and
// TEXCOORD6 respectively. Finally the position array is projected with the
// model-view-projection matrix uniform.
3473 void DPSOFTRAST_VertexShader_LightDirection(void)
3476 int numvertices = dpsoftrast.numvertices;
3478 float LightVector[4];
3479 float EyePosition[4];
3480 float EyeVectorModelSpace[4];
// Snapshot the light direction and eye position uniforms into locals.
3486 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3487 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3488 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3489 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3490 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3491 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3492 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3493 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read below; presumably Array_Load/Array_Transform fill
// dpsoftrast.post_array4f[] for the named array (confirm against their definitions).
// TEXCOORD0 is run through the TexMatrix uniform; the others are loaded as-is.
3494 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3495 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3496 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3497 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3498 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3499 for (i = 0;i < numvertices;i++)
// Gather this vertex's position and its svector/tvector/normal basis
// (only the first three components of each 4-float entry are used).
3501 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3502 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3503 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3504 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3505 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3506 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3507 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3508 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3509 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3510 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3511 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3512 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction in the vertex basis: one dot product per basis vector.
// The result goes to TEXCOORD5 with w forced to 0.
3513 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3514 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3515 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3516 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3517 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3518 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3519 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then the same change of basis.
// The result goes to TEXCOORD6 with w forced to 0 (not normalized here).
3520 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3521 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3522 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3523 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3524 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3525 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3526 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3527 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3528 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3529 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Project positions last, after the loop above has consumed the unprojected
// values. NOTE(review): the meaning of the -1 second argument is not visible
// here; check DPSOFTRAST_Array_TransformProject.
3531 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Scalar min/max helpers. Both are plain macros that evaluate their arguments
// more than once, so arguments must be free of side effects.
3534 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3535 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component vector helpers operating on indexable operands (elements 0..2).
// Vector3Dot: dot product; Vector3LengthSquared: dot of v with itself;
// Vector3Length: sqrt of the squared length (uses sqrt, so the result is double).
3536 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3537 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3538 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3539 #define DPSOFTRAST_Vector3Normalize(v)\
3542 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader for the directional-light family of shader modes
// (plain light direction, model-space and tangent-space light direction maps,
// and fake light). It shades one span into buffer_FragColorbgra8 and hands
// the result to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// Three code paths are selected from thread->shader_permutation:
//   - SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular
//   - SHADERPERMUTATION_DIFFUSE:  ambient + diffuse
//   - otherwise:                  ambient only
// Each path optionally adds the glow texture (SHADERPERMUTATION_GLOW).
// Within the lit paths, thread->shader_mode decides where the light direction
// and light colour come from.
//
// Output channels are clamped to an upper bound of 255 only; no lower clamp
// is applied in the visible code.
3553 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one z value and one 4-byte texel per pixel.
3555 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3556 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3557 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3558 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3559 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3560 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3561 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3562 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3563 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3564 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565 int x, startx = span->startx, endx = span->endx;
3566 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Interpolants are kept as a base value plus a per-pixel slope, evaluated
// below as data[k] + slope[k]*x.
3567 float LightVectordata[4];
3568 float LightVectorslope[4];
3569 float EyeVectordata[4];
3570 float EyeVectorslope[4];
3571 float VectorSdata[4];
3572 float VectorSslope[4];
3573 float VectorTdata[4];
3574 float VectorTslope[4];
3575 float VectorRdata[4];
3576 float VectorRslope[4];
3578 float diffusetex[4];
3580 float surfacenormal[4];
3581 float lightnormal[4];
3582 float lightnormal_modelspace[4];
3584 float specularnormal[4];
3587 float SpecularPower;
// Colour uniforms: uniform components 0,1,2 are stored at local indices 2,1,0
// (presumably RGB -> BGR so they line up with the bgra8 texel buffers; confirm).
3589 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3590 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3591 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3592 Color_Glow[3] = 0.0f;
3593 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3594 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3595 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot carries the Alpha uniform; it scales the output alpha.
3596 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3597 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3598 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3599 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3600 Color_Pants[3] = 0.0f;
3601 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3602 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3603 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3604 Color_Shirt[3] = 0.0f;
// Begin the span (buffer_z is passed in and reused by every texture fetch) and
// sample the textures shared by all paths: colour always, pants/shirt and glow
// only when their permutation bits are set.
3605 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3606 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3607 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3609 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3610 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3612 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3614 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3616 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3618 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3619 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3620 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3621 Color_Diffuse[3] = 0.0f;
// Default light colour from the uniform; the light-direction-map and
// fake-light modes overwrite components 0..2 per pixel inside the loop.
3622 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3623 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3624 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3625 LightColor[3] = 0.0f;
3626 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630 Color_Specular[3] = 0.0f;
// SpecularPower is pre-divided by 255 because it is later multiplied by the
// gloss texture's alpha byte (0..255) to form the exponent.
3631 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is needed by every mode on this path for the
// specular term, so it is set up unconditionally.
3632 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3633 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs. Model space needs the S/T/R basis interpolants
// (TEXCOORD1..3) in addition to the lightmap and deluxemap samples.
3635 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3637 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3638 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3639 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3640 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3641 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3643 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3645 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3646 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3648 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3650 // nothing of this needed
// Remaining mode: the per-vertex light vector written to TEXCOORD5.
3654 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading loop for the specular path.
3657 for (x = startx;x < endx;x++)
3660 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3661 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3662 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3663 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping: add the pants/shirt layers tinted by their colours. The alpha
// contribution is zero because Color_Pants[3] and Color_Shirt[3] are 0.
3664 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3666 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3667 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3668 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3669 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3671 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3672 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3673 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3674 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte * (1/128) - 1, reading byte offsets 2,1,0
// into components 0,1,2, then normalize.
3675 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3676 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3677 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3678 DPSOFTRAST_Vector3Normalize(surfacenormal);
// Determine lightnormal (and, for the map modes, LightColor) for this pixel.
3680 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3682 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3683 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3684 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3685 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3687 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3688 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3689 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3690 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3692 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3693 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3694 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3695 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3697 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3698 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3699 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3700 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3702 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3703 DPSOFTRAST_Vector3Normalize(lightnormal);
3705 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap texel scaled by 1/256 and divided by the z component of the light
// normal; that divisor is floored at 0.25, which caps the boost at 4x.
3707 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3708 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3709 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3710 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Tangent-space map: the deluxemap texel is used directly as the light normal
// (no basis change and no normalization here); the lightmap is scaled by 1/256.
3713 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3715 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3716 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3717 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3719 float f = 1.0f / 256.0f;
3720 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3721 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3722 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: light comes from the eye direction with a constant white colour.
// z is presumably the per-pixel value taken from buffer_z (confirm).
3725 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3727 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3728 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3729 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3730 DPSOFTRAST_Vector3Normalize(lightnormal);
3732 LightColor[0] = 1.0;
3733 LightColor[1] = 1.0;
3734 LightColor[2] = 1.0;
// Remaining mode: interpolated per-vertex light vector; LightColor keeps the
// uniform value fetched before the loop.
3738 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3739 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3740 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3741 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped to be non-negative.
3744 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Specular term, exact variant: reflect the light direction about the surface
// normal and dot it with the normalized eye direction.
3746 if(thread->shader_exactspecularmath)
3748 // reflect lightnormal at surfacenormal, take the negative of that
3749 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3751 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3752 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3753 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3754 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3756 // dot of this and normalize(EyeVectorFogDepth.xyz)
3757 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3758 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3759 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3760 DPSOFTRAST_Vector3Normalize(eyenormal);
3762 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact variant: normalized half-angle vector (light + eye) dotted with
// the surface normal.
3766 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3767 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3768 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3769 DPSOFTRAST_Vector3Normalize(eyenormal);
3771 specularnormal[0] = lightnormal[0] + eyenormal[0];
3772 specularnormal[1] = lightnormal[1] + eyenormal[1];
3773 specularnormal[2] = lightnormal[2] + eyenormal[2];
3774 DPSOFTRAST_Vector3Normalize(specularnormal);
3776 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// The exponent is the SpecularPower uniform scaled by the gloss alpha (byte/255).
3779 specular = pow(specular, SpecularPower * glosstex[3]);
// Compose the output: [glow +] ambient + (diffuse + specular) * light colour.
// Alpha is the diffuse texture alpha times the Alpha uniform.
3780 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3782 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3783 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3784 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3785 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3789 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3790 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3791 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3792 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3795 buffer_FragColorbgra8[x*4+0] = d[0];
3796 buffer_FragColorbgra8[x*4+1] = d[1];
3797 buffer_FragColorbgra8[x*4+2] = d[2];
3798 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, no specular) ----
// Mirrors the specular path's setup and light-direction logic, minus the
// gloss texture, the eye vector (except for fake light) and the specular maths.
3801 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3803 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3804 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3805 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3806 Color_Diffuse[3] = 0.0f;
3807 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3808 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3809 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3810 LightColor[3] = 0.0f;
3811 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3813 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3815 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3816 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3817 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3818 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3819 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3821 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3823 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3824 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// On this path the eye vector is only needed when it stands in for the light.
3826 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3828 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3832 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel shading loop for the diffuse path. Unlike the specular loop, the
// pants/shirt layers are not added to diffusetex in the visible code.
3835 for (x = startx;x < endx;x++)
3838 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3839 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3840 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3841 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3842 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3843 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3844 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3845 DPSOFTRAST_Vector3Normalize(surfacenormal);
3847 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3849 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3850 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3851 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3852 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3854 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3855 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3856 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3857 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3859 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3860 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3861 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3862 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3864 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3865 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3866 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3867 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3869 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3870 DPSOFTRAST_Vector3Normalize(lightnormal);
3872 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3874 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3875 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3876 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3877 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3880 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3882 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3883 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3884 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3886 float f = 1.0f / 256.0f;
3887 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3888 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3889 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3892 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3894 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3895 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3896 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3897 DPSOFTRAST_Vector3Normalize(lightnormal);
3899 LightColor[0] = 1.0;
3900 LightColor[1] = 1.0;
3901 LightColor[2] = 1.0;
3905 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3906 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3907 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3908 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped to be non-negative, then
// output = [glow +] diffusetex * (ambient + diffuse colour * N.L * light colour).
3911 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3912 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3914 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3915 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3916 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3917 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3921 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3922 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3923 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3924 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3926 buffer_FragColorbgra8[x*4+0] = d[0];
3927 buffer_FragColorbgra8[x*4+1] = d[1];
3928 buffer_FragColorbgra8[x*4+2] = d[2];
3929 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only ----
// No normal map or light terms: output = [glow +] diffusetex * ambient.
3934 for (x = startx;x < endx;x++)
3937 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3938 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3939 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3940 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3942 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3944 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3945 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3946 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3947 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3951 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3952 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3953 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3954 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3956 buffer_FragColorbgra8[x*4+0] = d[0];
3957 buffer_FragColorbgra8[x*4+1] = d[1];
3958 buffer_FragColorbgra8[x*4+2] = d[2];
3959 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded span to the common finishing routine.
3962 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the positional light-source shader mode.
// For every vertex it computes the vertex-to-light and vertex-to-eye vectors,
// expresses both in the per-vertex basis read from TEXCOORD1 (svector),
// TEXCOORD2 (tvector) and TEXCOORD3 (normal), and stores them back into
// TEXCOORD1 and TEXCOORD2. The svector/tvector inputs are therefore
// overwritten once they have been read for that vertex. Afterwards the position
// array is projected, and TEXCOORD3 is rewritten from the POSITION array
// using the ModelToLight matrix uniform.
3967 void DPSOFTRAST_VertexShader_LightSource(void)
3970 int numvertices = dpsoftrast.numvertices;
3971 float LightPosition[4];
3972 float LightVector[4];
3973 float LightVectorModelSpace[4];
3974 float EyePosition[4];
3975 float EyeVectorModelSpace[4];
// Snapshot the light and eye position uniforms into locals.
3981 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3982 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3983 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3984 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3985 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3986 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3987 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3988 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays read below; presumably Array_Load/Array_Transform fill
// dpsoftrast.post_array4f[] for the named array (confirm against their definitions).
// TEXCOORD0 is run through the TexMatrix uniform; TEXCOORD4 is loaded but not
// otherwise touched in this function.
3989 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3990 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3991 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3992 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3993 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3994 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3995 for (i = 0;i < numvertices;i++)
// Copy position and basis vectors into locals first: TEXCOORD1/TEXCOORD2 are
// overwritten with the results further down in this same iteration.
3997 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3998 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3999 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4000 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4001 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4002 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4003 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4004 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4005 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4006 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4007 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4008 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, expressed in the vertex basis.
// The result goes to TEXCOORD1 with w forced to 0 (not normalized here).
4009 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4010 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4011 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4012 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4013 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4014 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4015 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4016 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4017 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4018 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, same change of basis.
// The result goes to TEXCOORD2 with w forced to 0.
4019 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4020 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4021 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4022 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4023 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4024 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4025 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4026 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4027 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4028 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project positions after the loop has consumed the unprojected values.
// NOTE(review): the meaning of the -1 second argument is not visible here.
4030 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD3 (previously the normal) is replaced by POSITION transformed with
// the ModelToLight matrix. NOTE(review): presumably this reads the original
// input positions rather than the projected ones written just above; confirm
// in DPSOFTRAST_Array_Transform.
4031 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4034 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4037 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4038 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4039 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4040 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4041 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4042 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4043 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4044 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4045 int x, startx = span->startx, endx = span->endx;
4046 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4047 float CubeVectordata[4];
4048 float CubeVectorslope[4];
4049 float LightVectordata[4];
4050 float LightVectorslope[4];
4051 float EyeVectordata[4];
4052 float EyeVectorslope[4];
4054 float diffusetex[4];
4056 float surfacenormal[4];
4057 float lightnormal[4];
4059 float specularnormal[4];
4062 float SpecularPower;
4063 float CubeVector[4];
4066 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4067 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4068 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4069 Color_Glow[3] = 0.0f;
4070 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4071 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4072 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4073 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4074 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4075 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4076 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4077 Color_Diffuse[3] = 0.0f;
4078 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4079 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4080 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4081 Color_Specular[3] = 0.0f;
4082 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4083 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4084 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4085 Color_Pants[3] = 0.0f;
4086 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4087 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4088 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4089 Color_Shirt[3] = 0.0f;
4090 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4091 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4092 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4093 LightColor[3] = 0.0f;
4094 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4095 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4096 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4097 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4098 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4099 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4100 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4101 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4103 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4104 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4106 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4107 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4108 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4110 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4111 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4112 for (x = startx;x < endx;x++)
4115 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4116 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4117 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4118 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4119 if (attenuation < 0.01f)
4121 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4123 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4124 if (attenuation < 0.01f)
4128 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4129 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4130 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4131 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4132 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4134 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4135 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4136 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4137 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4139 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4140 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4141 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4142 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4143 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4144 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4145 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4146 DPSOFTRAST_Vector3Normalize(surfacenormal);
4148 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4149 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4150 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4151 DPSOFTRAST_Vector3Normalize(lightnormal);
4153 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4155 if(thread->shader_exactspecularmath)
4157 // reflect lightnormal at surfacenormal, take the negative of that
4158 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4160 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4161 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4162 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4163 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4165 // dot of this and normalize(EyeVectorFogDepth.xyz)
4166 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4167 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4168 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4169 DPSOFTRAST_Vector3Normalize(eyenormal);
4171 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4175 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4176 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4177 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4178 DPSOFTRAST_Vector3Normalize(eyenormal);
4180 specularnormal[0] = lightnormal[0] + eyenormal[0];
4181 specularnormal[1] = lightnormal[1] + eyenormal[1];
4182 specularnormal[2] = lightnormal[2] + eyenormal[2];
4183 DPSOFTRAST_Vector3Normalize(specularnormal);
4185 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4187 specular = pow(specular, SpecularPower * glosstex[3]);
4189 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4191 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4192 attenuation *= (1.0f / 255.0f);
4193 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4194 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4195 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4196 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4200 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4201 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4202 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4203 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4205 buffer_FragColorbgra8[x*4+0] = d[0];
4206 buffer_FragColorbgra8[x*4+1] = d[1];
4207 buffer_FragColorbgra8[x*4+2] = d[2];
4208 buffer_FragColorbgra8[x*4+3] = d[3];
4211 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4213 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214 for (x = startx;x < endx;x++)
4217 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4218 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4219 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4220 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4221 if (attenuation < 0.01f)
4223 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4225 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4226 if (attenuation < 0.01f)
4230 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4231 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4232 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4233 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4234 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4236 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4237 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4238 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4239 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4241 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4242 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4243 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4244 DPSOFTRAST_Vector3Normalize(surfacenormal);
4246 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4247 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4248 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4249 DPSOFTRAST_Vector3Normalize(lightnormal);
4251 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4252 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4254 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4255 attenuation *= (1.0f / 255.0f);
4256 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4257 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4258 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4259 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4263 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4264 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4265 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4266 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4268 buffer_FragColorbgra8[x*4+0] = d[0];
4269 buffer_FragColorbgra8[x*4+1] = d[1];
4270 buffer_FragColorbgra8[x*4+2] = d[2];
4271 buffer_FragColorbgra8[x*4+3] = d[3];
4276 for (x = startx;x < endx;x++)
4279 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4280 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4281 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4282 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4283 if (attenuation < 0.01f)
4285 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4287 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4288 if (attenuation < 0.01f)
4292 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4293 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4294 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4295 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4296 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4298 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4299 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4300 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4301 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4303 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4305 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4306 attenuation *= (1.0f / 255.0f);
4307 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4308 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4309 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4310 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4314 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4315 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4316 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4317 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4319 buffer_FragColorbgra8[x*4+0] = d[0];
4320 buffer_FragColorbgra8[x*4+1] = d[1];
4321 buffer_FragColorbgra8[x*4+2] = d[2];
4322 buffer_FragColorbgra8[x*4+3] = d[3];
4325 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4331 void DPSOFTRAST_VertexShader_Refraction(void)
4333 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4334 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4335 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4338 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4340 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4342 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4344 int x, startx = span->startx, endx = span->endx;
4347 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4348 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4351 float ModelViewProjectionPositiondata[4];
4352 float ModelViewProjectionPositionslope[4];
4355 float ScreenScaleRefractReflect[2];
4356 float ScreenCenterRefractReflect[2];
4357 float DistortScaleRefractReflect[2];
4358 float RefractColor[4];
4360 const unsigned char * RESTRICT pixelbase;
4361 const unsigned char * RESTRICT pixel[4];
4362 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4363 if(!texture) return;
4364 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4367 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4368 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4371 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4374 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4375 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4376 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4377 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4378 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4379 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4380 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4381 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4382 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4383 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4386 for (x = startx;x < endx;x++)
4388 float SafeScreenTexCoord[2];
4389 float ScreenTexCoord[2];
4396 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4397 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4399 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4400 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4401 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4403 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4404 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4405 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4406 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4407 DPSOFTRAST_Vector3Normalize(v);
4408 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4409 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4411 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4412 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4414 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4415 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4416 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4417 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4418 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4419 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4420 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4421 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4422 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4423 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4424 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4425 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4426 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4427 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4428 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4429 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4430 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4434 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4435 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4436 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4437 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4443 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4444 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4445 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4446 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4447 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4450 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4455 void DPSOFTRAST_VertexShader_Water(void)
4457 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4461 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4464 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4465 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4466 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4467 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4468 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4473 void DPSOFTRAST_VertexShader_ShowDepth(void)
4475 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4478 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4481 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4482 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4483 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4484 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4485 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4490 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4492 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4495 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4498 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4499 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4500 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4501 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4502 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4507 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4509 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4512 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4515 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4516 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4517 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4518 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4519 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4524 typedef struct DPSOFTRAST_ShaderModeInfo_s
4527 void (*Vertex)(void);
4528 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4529 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4530 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4532 DPSOFTRAST_ShaderModeInfo;
4534 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4536 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4537 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4538 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4539 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4540 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4541 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4542 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4543 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4544 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4545 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4546 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4547 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4548 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4549 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4550 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4551 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
4554 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4559 unsigned int *depthpixel;
4563 unsigned char *pixelmask;
4564 DPSOFTRAST_State_Triangle *triangle;
4565 triangle = &thread->triangles[span->triangle];
4566 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4567 startx = span->startx;
4569 depth = span->depthbase;
4570 depthslope = span->depthslope;
4571 pixelmask = thread->pixelmaskarray;
4572 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4574 switch(thread->fb_depthfunc)
4577 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4578 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4579 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4580 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4581 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4582 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4583 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4585 while (startx < endx && !pixelmask[startx])
4587 while (endx > startx && !pixelmask[endx-1])
4592 // no depth testing means we're just dealing with color...
4593 memset(pixelmask + startx, 1, endx - startx);
4595 span->pixelmask = pixelmask;
4596 span->startx = startx;
4600 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4602 int x, d, depth, depthslope, startx, endx;
4603 const unsigned char *pixelmask;
4604 unsigned int *depthpixel;
4605 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4607 depth = span->depthbase;
4608 depthslope = span->depthslope;
4609 pixelmask = span->pixelmask;
4610 startx = span->startx;
4612 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4613 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every span queued on the thread (thread->spans[0 .. numspans-1])
// and then resets thread->numspans to 0.
// Per span: depth test, optional pixel shader (the current shader mode's Span
// callback), then depth write.
//   thread: per-thread state owning the span and triangle queues.
4619 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4622 DPSOFTRAST_State_Triangle *triangle;
4623 DPSOFTRAST_State_Span *span;
4624 for (i = 0; i < thread->numspans; i++)
4626 span = &thread->spans[i];
// each span refers to its triangle setup by index
4627 triangle = &thread->triangles[span->triangle];
4628 DPSOFTRAST_Draw_DepthTest(thread, span);
// empty pixel range after the depth test
// NOTE(review): the statement controlled by this test is not shown in this listing
4629 if (span->startx >= span->endx)
4631 // run pixel shader if appropriate
4632 // do this before running depthmask code, to allow the pixelshader
4633 // to clear pixelmask values for alpha testing
4634 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4635 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4636 DPSOFTRAST_Draw_DepthWrite(thread, span);
// queue is drained; start refilling from slot 0
4638 thread->numspans = 0;
// Command record for the Draw command (id 22); the field list is its payload.
// refcount is an ATOMIC_COUNTER because it is decremented with ATOMIC_DECREMENT
// by the code that interprets the command.
// NOTE(review): DEFCOMMAND is defined outside this listing; presumably it
// declares DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw - confirm.
4641 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one queued Draw command on behalf of a single thread.
//
//   thread:  per-thread rasterizer state; its triangle and span queues are filled here.
//   command: the Draw command (vertex arrays, index list, row range, refcount).
//
// Visible flow: validate state, clamp the thread's two row bands to the scissor
// rows, drop this thread's reference when the command's rows miss both bands,
// then for each triangle: rebase its three indices by firstvertex, clip against
// the near plane (distance = z + w of the position array), build a 3- or 4-point
// screen polygon, reject it when its clamped bounding box is empty, compute the
// w / clip-plane / attribute plane equations and per-texture-unit mip levels,
// and emit spans row by row.
// The queues are drained with DPSOFTRAST_Draw_ProcessSpans whenever they fill up
// and once more at the end.
// On both exit paths command->refcount is atomically decremented; when it
// reaches zero and commandsize is no larger than the bare command struct,
// command->arrays is released with MM_FREE.
// NOTE(review): this listing omits a number of short lines (braces, declarations,
// single statements such as continue/return), so some control flow is only
// implied by the surrounding code.
4643 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4646 int cullface = thread->cullface;
4647 int minx, maxx, miny, maxy;
4648 int miny1, maxy1, miny2, maxy2;
4649 __m128i fbmin, fbmax;
4650 __m128 viewportcenter, viewportscale;
4651 int firstvertex = command->firstvertex;
4652 int numvertices = command->numvertices;
4653 int numtriangles = command->numtriangles;
4654 const int *element3i = command->element3i;
4655 const unsigned short *element3s = command->element3s;
4656 int clipped = command->clipped;
4663 int starty, endy, bandy;
4667 float clip0origin, clip0slope;
4669 __m128 triangleedge1, triangleedge2, trianglenormal;
4672 DPSOFTRAST_State_Triangle *triangle;
4673 DPSOFTRAST_Texture *texture;
4674 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// scissor rows, then the thread's two row bands clamped into that range
4675 miny = thread->fb_scissor[1];
4676 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4677 miny1 = bound(miny, thread->miny1, maxy);
4678 maxy1 = bound(miny, thread->maxy1, maxy);
4679 miny2 = bound(miny, thread->miny2, maxy);
4680 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range touches neither band: nothing to draw for this thread
4681 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4683 if (!ATOMIC_DECREMENT(command->refcount))
4685 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4686 MM_FREE(command->arrays);
// scissor columns; fbmin/fbmax hold the clamp rectangle as packed 16-bit x,y pairs
4690 minx = thread->fb_scissor[0];
4691 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4692 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4693 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4694 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4695 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4696 screen[3] = _mm_setzero_ps();
4697 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4698 for (i = 0;i < numtriangles;i++)
// the command's data starts with the screen coordinates (4 floats per vertex),
// followed by further per-vertex float[4] arrays
4700 const float *screencoord4f = command->arrays;
4701 const float *arrays = screencoord4f + numvertices*4;
4703 // generate the 3 edges of this triangle
4704 // generate spans for the triangle - switch based on left split or right split classification of triangle
// indices come from the 16-bit or the 32-bit list and are rebased by firstvertex
// NOTE(review): the condition choosing between the two lists is not shown in this listing
4707 e[0] = element3s[i*3+0] - firstvertex;
4708 e[1] = element3s[i*3+1] - firstvertex;
4709 e[2] = element3s[i*3+2] - firstvertex;
4713 e[0] = element3i[i*3+0] - firstvertex;
4714 e[1] = element3i[i*3+1] - firstvertex;
4715 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds two screen-space edges from screen[0..2], computes one
// component of their cross product into trianglenormal and tests its sign
// NOTE(review): the lines acting on those sign tests are not shown in this listing
4724 #define SKIPBACKFACE \
4725 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4726 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4727 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4728 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4729 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4733 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4737 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4742 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4743 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4745 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4746 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4748 #define CLIPPEDVERTEXCOPY(k,p1) \
4749 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY loads vertex p1's value from the current attribute array unchanged
4751 #define GENATTRIBCOPY(attrib, p1) \
4752 attrib = _mm_load_ps(&arrays[e[p1]*4]);
// GENATTRIBLERP interpolates between vertices p1 and p2 using the fraction that
// CLIPPEDVERTEXLERP stored in clipfrac[p1]
4753 #define GENATTRIBLERP(attrib, p1, p2) \
4755 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4756 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4758 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4762 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4763 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4764 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4765 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4766 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4767 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4768 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4774 // calculate distance from nearplane
4775 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4776 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4777 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// one branch per combination of vertices in front of (>= 0) or behind the plane;
// vertices in front are copied, crossings are interpolated along the clipped edge
4778 if (clipdist[0] >= 0.0f)
4780 if (clipdist[1] >= 0.0f)
4782 if (clipdist[2] >= 0.0f)
4785 // triangle is entirely in front of nearplane
4786 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4793 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4801 if (clipdist[2] >= 0.0f)
4803 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4810 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4817 else if (clipdist[1] >= 0.0f)
4819 if (clipdist[2] >= 0.0f)
4821 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4828 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4834 else if (clipdist[2] >= 0.0f)
4836 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4841 else continue; // triangle is entirely behind nearplane
4844 // calculate integer y coords for triangle points
// screeni packs the truncated x,y of up to four points as 16-bit values (the
// third point is repeated when there are only three); min/max reductions then
// give the bounding box, which is clamped to fbmin/fbmax
4845 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4846 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4847 screenmin = _mm_min_epi16(screeni, screenir),
4848 screenmax = _mm_max_epi16(screeni, screenir);
4849 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4850 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4851 screenmin = _mm_max_epi16(screenmin, fbmin);
4852 screenmax = _mm_min_epi16(screenmax, fbmax);
4853 // skip offscreen triangles
4854 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// row range of the clamped bounding box; endy is exclusive
4856 starty = _mm_extract_epi16(screenmin, 1);
4857 endy = _mm_extract_epi16(screenmax, 1)+1;
4858 if (starty >= maxy1 && endy <= miny2)
// keep only the y halves, sign-extended to 32 bits, for the per-row comparisons below
4860 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle queue
4863 triangle = &thread->triangles[thread->numtriangles];
4865 // calculate attribute plans for triangle data...
4866 // okay, this triangle is going to produce spans, we'd better project
4867 // the interpolants now (this is what gives perspective texturing),
4868 // this consists of simply multiplying all arrays by the W coord
4869 // (which is basically 1/Z), which will be undone per-pixel
4870 // (multiplying by Z again) to get the perspective-correct array
4873 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4874 __m128 mipedgescale, mipdensity;
// edge vectors divided by the normal component, then broadcast one lane each
4875 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4876 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4877 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4878 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4879 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for w: stored as x slope, y slope and origin in triangle->w[0..2]
4880 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4881 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4882 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4883 attribedge1 = _mm_sub_ss(w0, w1);
4884 attribedge2 = _mm_sub_ss(w2, w1);
4885 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4886 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4887 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4888 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4889 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4890 _mm_store_ss(&triangle->w[0], attribxslope);
4891 _mm_store_ss(&triangle->w[1], attribyslope);
4892 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when one of its first three coefficients is
// non-zero; the screen z plane equation is combined with the plane coefficients
// to get a per-row boundary (clip0origin + row * clip0slope)
4897 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4899 float cliporigin, clipxslope, clipyslope;
4900 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4901 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4902 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4903 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4904 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4905 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4906 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4907 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4908 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// NOTE(review): the condition guarding this first case (division by clipxslope)
// is not shown in this listing
4911 clip0origin = -cliporigin/clipxslope;
4912 clip0slope = -clipyslope/clipxslope;
4913 clip0dir = clipxslope > 0 ? 1 : -1;
4915 else if(clipyslope > 0)
4917 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4918 clip0slope = dpsoftrast.fb_width;
4921 else if(clipyslope < 0)
4923 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4924 clip0slope = -dpsoftrast.fb_width;
4927 else if(clip0origin < 0) continue;
// per-attribute plane equations: each array used by the current shader mode is
// multiplied by w at the three vertices and stored as x slope, y slope, origin
4930 mipedgescale = _mm_setzero_ps();
4931 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4933 __m128 attrib0, attrib1, attrib2;
4934 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4935 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-vertex array in the command's data
4937 arrays += numvertices*4;
4938 GENATTRIBS(attrib0, attrib1, attrib2);
4939 attriborigin = _mm_mul_ps(attrib1, w1);
4940 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4941 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4942 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4943 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4944 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4945 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4946 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4947 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array the shader mode names as its lod source: attribute change along
// two edges scaled by the reciprocal screen-space length of those edges
4948 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4950 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4951 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4952 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4953 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip level per texture unit; defaults to 0 for every unit
4957 memset(triangle->mip, 0, sizeof(triangle->mip));
4958 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4960 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4961 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4963 texture = thread->texbound[texunit];
4964 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3], square and sum per
// edge, then take the smaller of the two edge results
4966 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4967 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4968 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4969 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4970 // this will be multiplied in the texturing routine by the texture resolution
4971 y = _mm_cvtss_si32(mipdensity);
// half of log2(y): y is a squared quantity, so this is log2 of its square root;
// the result is capped at the texture's last mip level
4974 y = (int)(log((float)y)*0.5f/M_LN2);
4975 if (y > texture->mipmaps - 1)
4976 y = texture->mipmaps - 1;
4977 triangle->mip[texunit] = y;
// span generation: rows are walked in the first band (up to maxy1), then the
// loop skips ahead to miny2 and continues in the second band (up to maxy2)
4983 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4986 __m128 xcoords, xslope;
// ycc flags, per polygon point, whether the current row is greater than that point's y
4987 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4988 int yccmask = _mm_movemask_epi8(ycc);
4989 int edge0p, edge0n, edge1p, edge1n;
// the case labels below are byte masks built from ycc; each selects the two
// polygon edges (previous/next point indices) that bound the current row.
// First table has four mask digits (4-point polygon), second has three.
// NOTE(review): the enclosing switch statements are not shown in this listing
4998 case 0xFFFF: /*0000*/ y = endy; continue;
4999 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5000 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5001 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5002 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5003 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5004 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5005 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5006 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5007 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5008 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5009 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5010 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5011 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5012 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5013 case 0x0000: /*1111*/ y++; continue;
5021 case 0xFFFF: /*000*/ y = endy; continue;
5022 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5023 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5024 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5025 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5026 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5027 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5028 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can reuse the current edge pair, limited to the current band
5031 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5032 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5033 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5034 nexty = _mm_extract_epi16(ycc, 0);
5035 if (nexty >= bandy) nexty = bandy-1;
// x step per row for both edges (dx/dy), then their x positions at row y plus 0.5
5036 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5037 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5038 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5039 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5040 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half so it becomes startx
5041 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5043 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5044 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5046 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5047 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5049 int startx, endx, offset;
5050 startx = _mm_cvtss_si32(xcoords);
5051 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// clamp the row's extent to the scissor columns
5052 if (startx < minx) startx = minx;
5053 if (endx > maxx) endx = maxx;
5054 if (startx >= endx) continue;
// trim the row against the clip plane boundary clip0
// NOTE(review): the conditions selecting these branches are only partly shown
5062 if(endx <= clip0) continue;
5063 startx = (int)clip0;
5066 else if (endx > clip0)
5068 if(startx >= clip0) continue;
// rows longer than DPSOFTRAST_DRAW_MAXSPANLENGTH are split into several spans
5073 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5075 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5076 span->triangle = thread->numtriangles;
5080 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5081 if (span->startx >= span->endx)
// fixed-point depth at the span origin and its per-pixel step, derived from the
// w plane equation; the polygon offset terms are folded into the base value
5083 wslope = triangle->w[0];
5084 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5085 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5086 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: process it now (which also resets numspans)
5087 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5088 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: pending spans must be processed before slots are reused
5093 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5095 DPSOFTRAST_Draw_ProcessSpans(thread);
5096 thread->numtriangles = 0;
// drop this thread's reference; the last one to finish releases separately allocated data
5100 if (!ATOMIC_DECREMENT(command->refcount))
5102 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5103 MM_FREE(command->arrays);
// process whatever is still queued
5106 if (thread->numspans > 0 || thread->numtriangles > 0)
5108 DPSOFTRAST_Draw_ProcessSpans(thread);
5109 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its per-vertex data and a
// private copy of the index list.
//
//   firstvertex, numvertices, numtriangles: stored in the command as given.
//   element3i / element3s: 32-bit / 16-bit index lists; the one that is used is
//                          copied into the command's data area.
//
// Data layout: numvertices float[4] screen coordinates, numvertices float[4] for
// DPSOFTRAST_ARRAY_POSITION, one further float[4] array per array the current
// shader mode uses, then the indices.
// When command plus data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is
// allocated separately with MM_CALLOC; otherwise it is placed directly after the
// command.
// Side effects: resets dpsoftrast.post_array4f and points its entries (and
// dpsoftrast.screencoord4f) into the new data; sets dpsoftrast.firstvertex and
// dpsoftrast.numvertices.
// NOTE(review): the return statement is not shown in this listing; presumably
// the new command is returned - confirm. The MM_CALLOC result is not checked in
// the visible lines.
5114 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5118 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays per vertex are always present (screen coords + position)
5119 int datasize = 2*numvertices*sizeof(float[4]);
5120 DPSOFTRAST_Command_Draw *command;
5121 unsigned char *data;
// plus one float[4] array per array used by the current shader mode
5122 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5124 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5125 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5127 datasize += numvertices*sizeof(float[4]);
// plus room for the index list, sized for 16-bit or 32-bit indices
// NOTE(review): the condition choosing between the two is not shown in this listing
5130 datasize += numtriangles*sizeof(unsigned short[3]);
5132 datasize += numtriangles*sizeof(int[3]);
5133 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for one command: bare command plus separately allocated, zeroed data
5134 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5136 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5137 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data lives inline, right behind the command struct
5141 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5142 data = (unsigned char *)command + commandsize;
5144 command->firstvertex = firstvertex;
5145 command->numvertices = numvertices;
5146 command->numtriangles = numtriangles;
5147 command->arrays = (float *)data;
// carve the data area into the individual arrays
5148 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5149 dpsoftrast.firstvertex = firstvertex;
5150 dpsoftrast.numvertices = numvertices;
5151 dpsoftrast.screencoord4f = (float *)data;
5152 data += numvertices*sizeof(float[4]);
5153 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5154 data += numvertices*sizeof(float[4]);
5155 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5157 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5158 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5160 dpsoftrast.post_array4f[j] = (float *)data;
5161 data += numvertices*sizeof(float[4]);
// the indices are copied so the command does not depend on the caller's buffers
5163 command->element3i = NULL;
5164 command->element3s = NULL;
5167 command->element3s = (unsigned short *)data;
5168 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5172 command->element3i = (int *)data;
5173 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: queues one Draw command for the given triangles.
//
//   firstvertex, numvertices: vertex range used by the draw.
//   numtriangles:             number of triangles in the index list.
//   element3i / element3s:    32-bit / 16-bit index lists.
//
// Steps: allocate the command, run the current shader mode's Vertex() callback,
// clamp the draw's row range to [0, fb_height] and discard the command when that
// range is empty. Otherwise set refcount to the number of threads and either
// wake the starving worker threads whose row bands overlap the draw, or flush.
// NOTE(review): the early return after the discard and the else separating the
// threaded path from the flush are not shown in this listing.
5178 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5180 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5181 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5182 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5183 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated data and take the command back
5184 if (command->starty >= command->endy)
5186 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5187 MM_FREE(command->arrays);
5188 DPSOFTRAST_UndoCommand(command->commandsize);
5191 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread decrements it when done with the command
5192 command->refcount = dpsoftrast.numthreads;
5194 if (dpsoftrast.usethreads)
5197 DPSOFTRAST_Draw_SyncCommands();
5198 for (i = 0; i < dpsoftrast.numthreads; i++)
5200 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// only wake threads that are starving and whose bands overlap the draw's rows
5201 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5202 Thread_CondSignal(thread->drawcond);
5207 DPSOFTRAST_Draw_FlushThreads();
// Command record for the SetRenderTargets command (id 23): carries width and height.
// NOTE(review): DEFCOMMAND is defined outside this listing; presumably it
// declares DPSOFTRAST_Command_SetRenderTargets and its opcode - confirm.
5211 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for SetRenderTargets: flags the thread's framebuffer state
// (DPSOFTRAST_VALIDATE_FB) for revalidation. The command's fields are not read
// in the visible code.
5212 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5214 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: selects the framebuffer size, the depth buffer and up to
// four color buffers, recomputes the viewport transform and queues a
// SetRenderTargets command carrying the new width and height.
//
//   width, height:   framebuffer dimensions.
//   depthpixels:     depth buffer.
//   colorpixels0..3: color buffers.
5216 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5218 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target differs from the current one
// NOTE(review): the statement controlled by this condition is not shown in this listing
5219 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5220 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5221 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5223 dpsoftrast.fb_width = width;
5224 dpsoftrast.fb_height = height;
5225 dpsoftrast.fb_depthpixels = depthpixels;
5226 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5227 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5228 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5229 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5230 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5231 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5232 command->width = width;
5233 command->height = height;
// Runs the thread's pending commands from the shared command pool, starting at
// thread->commandoffset and stopping when endoffset is reached.
//
//   thread:    thread whose state the commands are applied to.
//   endoffset: pool offset at which interpretation stops.
//
// Each command is dispatched on its opcode to the matching
// DPSOFTRAST_Interpret_<name> handler. The offset wraps to 0 once it reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL, and the final offset is written back to
// thread->commandoffset.
5236 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5238 int commandoffset = thread->commandoffset;
5239 while (commandoffset != endoffset)
5241 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5242 switch (command->opcode)
// INTERPCOMMAND(name): case label for a fixed-size command - call its handler,
// advance by the aligned struct size and wrap at the end of the pool
5244 #define INTERPCOMMAND(name) \
5245 case DPSOFTRAST_OPCODE_##name : \
5246 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5247 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5248 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5249 commandoffset = 0; \
5251 INTERPCOMMAND(Viewport)
5252 INTERPCOMMAND(ClearColor)
5253 INTERPCOMMAND(ClearDepth)
5254 INTERPCOMMAND(ColorMask)
5255 INTERPCOMMAND(DepthTest)
5256 INTERPCOMMAND(ScissorTest)
5257 INTERPCOMMAND(Scissor)
5258 INTERPCOMMAND(BlendFunc)
5259 INTERPCOMMAND(BlendSubtract)
5260 INTERPCOMMAND(DepthMask)
5261 INTERPCOMMAND(DepthFunc)
5262 INTERPCOMMAND(DepthRange)
5263 INTERPCOMMAND(PolygonOffset)
5264 INTERPCOMMAND(CullFace)
5265 INTERPCOMMAND(AlphaTest)
5266 INTERPCOMMAND(AlphaFunc)
5267 INTERPCOMMAND(SetTexture)
5268 INTERPCOMMAND(SetShader)
5269 INTERPCOMMAND(Uniform4f)
5270 INTERPCOMMAND(UniformMatrix4f)
5271 INTERPCOMMAND(Uniform1i)
5272 INTERPCOMMAND(SetRenderTargets)
5273 INTERPCOMMAND(ClipPlane)
// Draw commands vary in size, so the advance uses the size stored in the command;
// the thread's offset is published right after each draw
5275 case DPSOFTRAST_OPCODE_Draw:
5276 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5277 commandoffset += command->commandsize;
5278 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5280 thread->commandoffset = commandoffset;
// NOTE(review): the handling of the Reset opcode is not shown in this listing
5283 case DPSOFTRAST_OPCODE_Reset:
5288 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
//
//   data: the DPSOFTRAST_State_Thread this worker serves.
//
// Loops while thread->index is non-negative: interprets any commands between the
// thread's own offset and dpsoftrast.drawcommand, then, under drawmutex, sleeps
// on drawcond if it is still caught up. Before sleeping it signals waitcond when
// another thread has marked itself as waiting on this worker.
// NOTE(review): the return statement is not shown in this listing.
5291 static int DPSOFTRAST_Draw_Thread(void *data)
5293 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5294 while(thread->index >= 0)
5296 if (thread->commandoffset != dpsoftrast.drawcommand)
5298 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5302 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so new work or a stop request is not missed
5303 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5305 if (thread->waiting) Thread_CondSignal(thread->waitcond);
// starving marks this worker as asleep and in need of a drawcond signal
5306 thread->starving = true;
5307 Thread_CondWait(thread->drawcond, thread->drawmutex);
5308 thread->starving = false;
5310 Thread_UnlockMutex(thread->drawmutex);
// Makes sure every thread has executed all commands queued so far, then marks
// the command pool as empty (usedcommands = 0).
//
// With worker threads: a first pass wakes starving workers that are behind, a
// second pass blocks on each worker's waitcond until it has caught up.
// The last loop interprets pending commands directly on the calling thread.
// NOTE(review): the else that presumably separates that last loop from the
// threaded path is not shown in this listing.
5316 static void DPSOFTRAST_Draw_FlushThreads(void)
5318 DPSOFTRAST_State_Thread *thread;
5320 DPSOFTRAST_Draw_SyncCommands();
5321 if (dpsoftrast.usethreads)
// pass 1: wake sleeping workers that still have commands to run
5323 for (i = 0; i < dpsoftrast.numthreads; i++)
5325 thread = &dpsoftrast.threads[i];
5326 if (thread->commandoffset != dpsoftrast.drawcommand)
5328 Thread_LockMutex(thread->drawmutex);
5329 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5330 Thread_CondSignal(thread->drawcond);
5331 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker that is still behind
5334 for (i = 0; i < dpsoftrast.numthreads; i++)
5336 thread = &dpsoftrast.threads[i];
5337 if (thread->commandoffset != dpsoftrast.drawcommand)
5339 Thread_LockMutex(thread->drawmutex);
// NOTE(review): a single wait guarded by if, not a loop - confirm that
// Thread_CondWait cannot return before the worker has caught up
5340 if (thread->commandoffset != dpsoftrast.drawcommand)
5342 thread->waiting = true;
5343 Thread_CondWait(thread->waitcond, thread->drawmutex);
5344 thread->waiting = false;
5346 Thread_UnlockMutex(thread->drawmutex);
// run pending commands on the calling thread
5352 for (i = 0; i < dpsoftrast.numthreads; i++)
5354 thread = &dpsoftrast.threads[i];
5355 if (thread->commandoffset != dpsoftrast.drawcommand)
5356 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5359 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point: executes all queued commands by calling
// DPSOFTRAST_Draw_FlushThreads.
5362 void DPSOFTRAST_Flush(void)
5364 DPSOFTRAST_Draw_FlushThreads();
// Public "finish" entry point.
// NOTE(review): the body is not visible in this listing; presumably it completes
// all queued work like DPSOFTRAST_Flush - confirm against the full source.
5367 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state (dpsoftrast) and its per-thread states.
//
//   width, height: framebuffer dimensions; also used for the initial viewport and scissor.
//   numthreads:    requested worker count; threading is used only when this is
//                  positive and Thread_HasThreads() reports support, in which
//                  case the count is clamped to 1..64. Otherwise a single
//                  thread state is created and no worker is started.
//   interlace:     clamped to 0..1 when threading is used, forced to 0 otherwise.
//   colorpixels:   color buffer 0; color buffers 1..3 start out as NULL.
//   depthpixels:   depth buffer.
//
// Every thread state receives the same default render state and is validated
// once; with threading enabled each one also gets two condition variables, a
// mutex and a worker running DPSOFTRAST_Draw_Thread.
// NOTE(review): this listing omits some short lines of the function (local
// declarations, braces, the assignment of thread->index, the return statement).
//
// Fix: the three NULL assignments after fb_colorpixels[0] all targeted index 1;
// they now clear indices 1, 2 and 3, matching the four slots written by
// DPSOFTRAST_SetRenderTargets. (The preceding memset already zeroes the whole
// structure, so the old code happened to leave the same bytes behind.)
5372 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5382 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// NOTE(review): u is declared on a line not shown here; presumably a union used
// for byte-order detection - confirm
5383 dpsoftrast.bigendian = u.b[3];
5384 dpsoftrast.fb_width = width;
5385 dpsoftrast.fb_height = height;
5386 dpsoftrast.fb_depthpixels = depthpixels;
5387 dpsoftrast.fb_colorpixels[0] = colorpixels;
5388 dpsoftrast.fb_colorpixels[1] = NULL;
5389 dpsoftrast.fb_colorpixels[2] = NULL;
5390 dpsoftrast.fb_colorpixels[3] = NULL;
// initial viewport covers the whole framebuffer
5391 dpsoftrast.viewport[0] = 0;
5392 dpsoftrast.viewport[1] = 0;
5393 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5394 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5395 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1; no texture storage is allocated here
5396 dpsoftrast.texture_firstfree = 1;
5397 dpsoftrast.texture_end = 1;
5398 dpsoftrast.texture_max = 0;
5399 dpsoftrast.color[0] = 1;
5400 dpsoftrast.color[1] = 1;
5401 dpsoftrast.color[2] = 1;
5402 dpsoftrast.color[3] = 1;
5403 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5404 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5405 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5406 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5407 for (i = 0; i < dpsoftrast.numthreads; i++)
5409 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state, identical for every thread
5411 thread->cullface = GL_BACK;
5412 thread->colormask[0] = 1;
5413 thread->colormask[1] = 1;
5414 thread->colormask[2] = 1;
5415 thread->colormask[3] = 1;
5416 thread->blendfunc[0] = GL_ONE;
5417 thread->blendfunc[1] = GL_ZERO;
5418 thread->depthmask = true;
5419 thread->depthtest = true;
5420 thread->depthfunc = GL_LEQUAL;
5421 thread->scissortest = false;
5422 thread->alphatest = false;
5423 thread->alphafunc = GL_GREATER;
5424 thread->alphavalue = 0.5f;
5425 thread->viewport[0] = 0;
5426 thread->viewport[1] = 0;
5427 thread->viewport[2] = dpsoftrast.fb_width;
5428 thread->viewport[3] = dpsoftrast.fb_height;
5429 thread->scissor[0] = 0;
5430 thread->scissor[1] = 0;
5431 thread->scissor[2] = dpsoftrast.fb_width;
5432 thread->scissor[3] = dpsoftrast.fb_height;
5433 thread->depthrange[0] = 0;
5434 thread->depthrange[1] = 1;
5435 thread->polygonoffset[0] = 0;
5436 thread->polygonoffset[1] = 0;
5437 thread->clipplane[0] = 0;
5438 thread->clipplane[1] = 0;
5439 thread->clipplane[2] = 0;
5440 thread->clipplane[3] = 1;
// empty work queues, nothing interpreted yet
5442 thread->numspans = 0;
5443 thread->numtriangles = 0;
5444 thread->commandoffset = 0;
5445 thread->waiting = false;
5446 thread->starving = false;
// mark all state as dirty and validate everything once
5448 thread->validate = -1;
5449 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker itself exist only in threaded mode
5451 if (dpsoftrast.usethreads)
5453 thread->waitcond = Thread_CreateCond();
5454 thread->drawcond = Thread_CreateCond();
5455 thread->drawmutex = Thread_CreateMutex();
5456 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5462 void DPSOFTRAST_Shutdown(void)
5465 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5467 DPSOFTRAST_State_Thread *thread;
5468 for (i = 0; i < dpsoftrast.numthreads; i++)
5470 thread = &dpsoftrast.threads[i];
5471 Thread_LockMutex(thread->drawmutex);
5473 Thread_CondSignal(thread->drawcond);
5474 Thread_UnlockMutex(thread->drawmutex);
5475 Thread_WaitThread(thread->thread, 0);
5476 Thread_DestroyCond(thread->waitcond);
5477 Thread_DestroyCond(thread->drawcond);
5478 Thread_DestroyMutex(thread->drawmutex);
5481 for (i = 0;i < dpsoftrast.texture_end;i++)
5482 if (dpsoftrast.texture[i].bytes)
5483 MM_FREE(dpsoftrast.texture[i].bytes);
5484 if (dpsoftrast.texture)
5485 free(dpsoftrast.texture);
5486 if (dpsoftrast.threads)
5487 MM_FREE(dpsoftrast.threads);
5488 memset(&dpsoftrast, 0, sizeof(dpsoftrast));