3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
// Alias qboolean as `bool` for this file (qboolean is not declared in this
// view; presumably it comes from dpsoftrast.h - TODO confirm).
14 typedef qboolean bool;
// Alignment in bytes that MM_MALLOC / MM_CALLOC pass to _mm_malloc.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of the alignment and atomic-counter macros:
//   ALIGN(var)          declare var with 16-byte alignment
//   ATOMIC(var)         declare var with 32-byte alignment
//   MEMORY_BARRIER      _mm_sfence()
//   ATOMIC_COUNTER      integer type used for shared counters
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD   atomic updates of a counter
// Apple: OSAtomic*Barrier primitives.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions with no alignment and plain (non-atomic) arithmetic.
// The preprocessor lines that close the chain above and guard ALIGN, ATOMIC
// and ATOMIC_ADD are not visible in this view.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
// Allocates `size` bytes aligned to ATOMIC_SIZE bytes using _mm_malloc.
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Releases memory obtained through the _mm_malloc-based MM_MALLOC / MM_CALLOC.
85 #define MM_FREE _mm_free
// Alternative definitions built on the C allocator; the preprocessor
// conditional that selects between the two sets is not visible in this view.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the array count and is used to
// size attribs[] and post_array4f[].
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One entry of the dpsoftrast.texture[] table (only some fields are visible
// in this view).
108 typedef struct DPSOFTRAST_Texture_s
// filter mode, assigned by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// counter of type ATOMIC_COUNTER (presumably the number of outstanding binds - TODO confirm)
118 ATOMIC_COUNTER binds;
// pixel storage for every mip level; a NULL pointer marks the slot as unused
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// COMMAND_SIZE is the granularity DPSOFTRAST_ALIGNCOMMAND rounds command sizes
// up to; COMMAND_ALIGN applies the ALIGN() attribute to a command struct.
// NOTE(review): ALIGN_SIZE is not defined in the visible part of the file - confirm.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Header layout shared by every command record: opcode followed by the
// record's size in bytes (written by DPSOFTRAST_AllocateCommand).
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is written at the tail of the pool by DPSOFTRAST_AllocateCommand
// when a command does not fit before the end.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> and the struct DPSOFTRAST_Command_<name>, which
// starts with the same opcode/commandsize header. One macro line (presumably
// the one expanding `fields`) is not visible in this view.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer, and a per-command size limit
// (MAXCOMMANDSIZE is not referenced in the visible code).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer shared between the submitting code and the threads. The
// freecommand / usedcommands members used elsewhere in the file are declared on
// lines not visible here.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data (only some fields are visible in this view).
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribs[array][k]: as used by DPSOFTRAST_CALCATTRIB, k=0 is the per-x slope,
// k=1 the per-y slope and k=2 the base value, each a 4-float vector
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// Evaluates attribute `arrayindex` of a triangle at a span's (x, y):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: slope and data are __m128 values and attribs is read with
// aligned loads. The macro's closing line is not visible in this view.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar version of the same computation; slope and data are 4-element float arrays.
// NOTE(review): `span` is used as (span->x) rather than ((span)->x), so only a
// plain identifier or pointer expression that binds tighter than -> is safe here.
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// Subspan limit (not referenced in the visible code).
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels belonging to a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
192 int depthslope; // depthbuffer value pixel delta
194 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers.
196 #define DPSOFTRAST_DRAW_MAXSPANS 1024
197 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
198 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Bits of thread->validate: each marks one group of derived state as stale.
// DPSOFTRAST_Validate recomputes a group and clears its bit.
200 #define DPSOFTRAST_VALIDATE_FB 1
201 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
202 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All three groups combined.
203 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes stored in thread->fb_blendmode. DPSOFTRAST_RecalcBlendFunc picks
// one from the (sfactor, dfactor) pair and the blendsubtract flag, falling back
// to OPAQUE for pairs it does not list.
205 typedef enum DPSOFTRAST_BLENDMODE_e
207 DPSOFTRAST_BLENDMODE_OPAQUE,
208 DPSOFTRAST_BLENDMODE_ALPHA,
209 DPSOFTRAST_BLENDMODE_ADDALPHA,
210 DPSOFTRAST_BLENDMODE_ADD,
211 DPSOFTRAST_BLENDMODE_INVMOD,
212 DPSOFTRAST_BLENDMODE_MUL,
213 DPSOFTRAST_BLENDMODE_MUL2,
214 DPSOFTRAST_BLENDMODE_SUBALPHA,
215 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
216 DPSOFTRAST_BLENDMODE_INVADD,
217 DPSOFTRAST_BLENDMODE_TOTAL
219 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The Interpret_* functions write the raw state
// (viewport, scissor, blendfunc, ...) and the Recalc* functions write the
// derived fb_* values. Many members used elsewhere in the file (index, validate,
// scissor, miny1, drawmutex, ...) are declared on lines not visible in this view.
221 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
240 float polygonoffset[2];
// clip plane in framebuffer space, computed by DPSOFTRAST_RecalcClipPlane
242 ALIGN(float fb_clipplane[4]);
245 int shader_permutation;
246 int shader_exactspecularmath;
// textures bound per unit; pointers into dpsoftrast.texture[] that
// DPSOFTRAST_Texture_Grow rebases when the table is reallocated
248 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
250 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
251 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
253 // DPSOFTRAST_VALIDATE_ flags
256 // derived values (DPSOFTRAST_VALIDATE_FB)
259 ALIGN(float fb_viewportcenter[4]);
260 ALIGN(float fb_viewportscale[4]);
262 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
265 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it
// with freecommand and drawcommand
274 ATOMIC(volatile int commandoffset);
// flags used with drawcond / waitcond in DPSOFTRAST_Draw_FreeCommandPool
276 volatile bool waiting;
277 volatile bool starving;
284 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
285 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
286 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
288 DPSOFTRAST_State_Thread);
// Global rasterizer state on the submitting side: framebuffer pointers, vertex
// array pointers, the texture table, the thread array and the command pool.
// Several members used elsewhere in the file (fb_width, fb_height, viewport,
// interlace, numthreads, usethreads, texture_max, texture_end, ...) are declared
// on lines not visible in this view.
290 typedef ATOMIC(struct DPSOFTRAST_State_s
294 unsigned int *fb_depthpixels;
295 unsigned int *fb_colorpixels[4];
// written by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
298 ALIGN(float fb_viewportcenter[4]);
299 ALIGN(float fb_viewportscale[4]);
302 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
303 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
305 const float *pointer_vertex3f;
306 const float *pointer_color4f;
307 const unsigned char *pointer_color4ub;
308 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
311 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
312 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
313 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
317 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
318 float *screencoord4f;
324 int shader_permutation;
325 int shader_exactspecularmath;
// lowest slot index at which DPSOFTRAST_Texture_New starts looking for a free entry
329 int texture_firstfree;
// texture table, grown by DPSOFTRAST_Texture_Grow
330 DPSOFTRAST_Texture *texture;
// last error message, set by functions that reject their arguments
335 const char *errorstring;
340 DPSOFTRAST_State_Thread *threads;
// set to commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
342 ATOMIC(volatile int drawcommand);
344 DPSOFTRAST_State_Command_Pool commandpool;
// the single global instance
348 DPSOFTRAST_State dpsoftrast;
// Scale and bias applied to depth values (DPSOFTRAST_DEPTHSCALE is 2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit pixel laid out as
// 0xAARRGGBB, rounding each channel to the nearest 8-bit value.
// Every argument is parenthesized so that expression arguments such as `x + y`
// are scaled as a whole, and the shifts are done on unsigned values because
// shifting 255 into bits 24..31 of a signed int overflows.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth-buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d); d is parenthesized for the same reason.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Forward declarations; both functions are defined outside this view.
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Computes the viewport transform for viewport = {x, y, width, height}.
// Outputs are 4-float arrays in which [1] is the x term and [2] the y term
// ([3] and [0] are presumably z and w - TODO confirm).
// The y center is measured from dpsoftrast.fb_height and the y scale is
// negative, i.e. the y axis is flipped relative to the viewport's y.
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
360 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362 fb_viewportcenter[3] = 0.5f;
363 fb_viewportcenter[0] = 0.0f;
364 fb_viewportscale[1] = 0.5f * viewport[2];
365 fb_viewportscale[2] = -0.5f * viewport[3];
366 fb_viewportscale[3] = 0.5f;
367 fb_viewportscale[0] = 1.0f;
// Assigns the framebuffer rows this thread works on, from thread->index,
// dpsoftrast.numthreads and dpsoftrast.fb_height.
// With dpsoftrast.interlace set, the thread gets two bands: miny1..maxy1 within
// the first half of the framebuffer and miny2..maxy2 within the second half.
// Otherwise both bands are set to the same single slice of the full height
// (the `else` line separating the two cases is not visible in this view).
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
372 if (dpsoftrast.interlace)
374 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Derives thread->fb_clipplane from thread->clipplane using the thread's
// viewport transform: each coefficient is divided by the matching viewport
// scale, then the constant term [3] is reduced by the dot product of the
// viewport center with the scaled plane.
// Must run after fb_viewportcenter / fb_viewportscale have been updated.
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
388 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes all framebuffer-derived state of a thread: the clamped scissor
// rectangle (fb_scissor = x, y, width, height), the viewport transform, the
// clip plane and the thread's row bands.
// The declarations of x1/y1/x2/y2 and two clamp lines between the visible ones
// are not visible in this view.
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
397 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398 // and viewport projection values
// scissor rectangle with y flipped against the framebuffer height
401 x1 = thread->scissor[0];
402 x2 = thread->scissor[0] + thread->scissor[2];
403 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
405 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
407 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
409 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410 thread->fb_scissor[0] = x1;
411 thread->fb_scissor[1] = y1;
412 thread->fb_scissor[2] = x2 - x1;
413 thread->fb_scissor[3] = y2 - y1;
// the clip plane depends on the viewport transform, so it is recomputed after it
415 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416 DPSOFTRAST_RecalcClipPlane(thread);
417 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth comparison: thread->depthfunc when depth testing
// is enabled, GL_ALWAYS otherwise.
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
422 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the thread's (blendfunc[0], blendfunc[1]) pair to a DPSOFTRAST_BLENDMODE
// stored in thread->fb_blendmode. The pair is packed as (sfactor<<16)|dfactor
// and matched against the combinations listed below; anything else falls back
// to DPSOFTRAST_BLENDMODE_OPAQUE.
// With blendsubtract set, only GL_SRC_ALPHA/GL_ONE is recognised (SUBALPHA).
// The braces and the `else` separating the two switches are not visible here.
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
427 if (thread->blendsubtract)
429 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
431 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
439 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
441 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both of the next two pairs select MUL
446 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the bits in f
// is set in thread->validate; always evaluates to 0.
// `thread` is parenthesized so that pointer expressions (e.g. `base + i`) work.
// Note that both arguments are evaluated twice when validation is needed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
// Recomputes the derived state groups that are both requested in `mask` and
// currently marked stale in thread->validate. Each group's flag is cleared
// before its Recalc function runs.
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// keep only the requested groups that are actually stale
460 mask &= thread->validate;
463 if (mask & DPSOFTRAST_VALIDATE_FB)
465 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466 DPSOFTRAST_RecalcFB(thread);
468 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
470 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471 DPSOFTRAST_RecalcDepthFunc(thread);
473 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
475 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by its index. Returns a pointer into dpsoftrast.texture[]
// when 1 <= index < dpsoftrast.texture_end and the slot has pixel storage
// (bytes != NULL). The return for the failing case is on a line not visible in
// this view; callers test the result with `if (!texture)`.
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
482 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture[] table and rebases every texbound[] pointer
// (global and per-thread) from the old allocation to the new one.
// Some lines (local declarations, braces, and anything between them) are not
// visible in this view.
487 static void DPSOFTRAST_Texture_Grow(void)
// remembered so bound-texture pointers can be converted to offsets below
489 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490 DPSOFTRAST_State_Thread *thread;
494 // expand texture array as needed
495 if (dpsoftrast.texture_max < 1024)
496 dpsoftrast.texture_max = 1024;
498 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture without a NULL
// check; on failure the old table is lost and the rebasing below is invalid.
499 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
500 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501 if (dpsoftrast.texbound[i])
502 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
503 for (j = 0; j < dpsoftrast.numthreads; j++)
505 thread = &dpsoftrast.threads[j];
506 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507 if (thread->texbound[i])
508 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and returns its index in dpsoftrast.texture[].
// Arguments are validated first; each rejection stores a message in
// dpsoftrast.errorstring (the return statements of the rejection paths, the
// local declarations and the mip loop header are on lines not visible in this
// view). On success a table slot is claimed, the mip chain layout is recorded
// in texture->mipmap[][] and zero-filled pixel storage is allocated.
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
521 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523 DPSOFTRAST_Texture *texture;
// NOTE(review): the product is formed in int before the MAXSIZE check below, so
// very large dimensions could overflow here - confirm whether that matters.
524 if (width*height*depth < 1)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
529 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
536 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
540 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
// NOTE(review): same message as above; the condition guarding this second
// assignment is on a line not visible here.
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on 3D textures (depth != 1) and cubemaps
558 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
560 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
563 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
565 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this check repeats the previous one verbatim.
568 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
570 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
573 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
578 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
583 // find first empty slot in texture array
584 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585 if (!dpsoftrast.texture[texnum].bytes)
587 dpsoftrast.texture_firstfree = texnum + 1;
// grow the table if the chosen slot lies beyond its capacity
588 if (dpsoftrast.texture_max <= texnum)
589 DPSOFTRAST_Texture_Grow();
590 if (dpsoftrast.texture_end <= texnum)
591 dpsoftrast.texture_end = texnum + 1;
592 texture = &dpsoftrast.texture[texnum];
593 memset(texture, 0, sizeof(*texture));
594 texture->flags = flags;
595 texture->width = width;
596 texture->height = height;
597 texture->depth = depth;
598 texture->sides = sides;
// per mip level: 4 bytes per texel; record offset, byte size and dimensions
610 s = w * h * d * sides * 4;
611 texture->mipmap[mipmaps][0] = size;
612 texture->mipmap[mipmaps][1] = s;
613 texture->mipmap[mipmaps][2] = w;
614 texture->mipmap[mipmaps][3] = h;
615 texture->mipmap[mipmaps][4] = d;
// the chain ends at 1x1x1, or after the first level when mipmaps are not requested
618 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
624 texture->mipmaps = mipmaps;
625 texture->size = size;
627 // allocate the pixels now
628 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage, clears its table entry and updates the
// free/used bookkeeping. Does nothing when `index` is not a live texture.
// A few lines between the lookup and the MM_FREE call are not visible here.
632 void DPSOFTRAST_Texture_Free(int index)
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
639 MM_FREE(texture->bytes);
640 texture->bytes = NULL;
// clearing the whole record also leaves bytes NULL, which marks the slot free
641 memset(texture, 0, sizeof(*texture));
642 // adjust the free range and used range
643 if (dpsoftrast.texture_firstfree > index)
644 dpsoftrast.texture_firstfree = index;
// trim trailing unused slots from the used range
645 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of a texture, each from the level above
// it, by averaging neighbouring 4-byte texels with rounding.
// For every output row, o points at the destination row and i0..i3 at the
// source rows (layer0/row0, layer0/row1, layer1/row0, layer1/row1); layer1 and
// row1 are clamped to the last source layer/row.
// The assignments of layer0/layer1/row0/row1, the early return and the
// condition that separates the 3D and 2D paths are on lines not visible here.
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
650 int i, x, y, z, w, layer0, layer1, row0, row1;
651 unsigned char *o, *i0, *i1, *i2, *i3;
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to generate when the texture has only its base level
654 if (texture->mipmaps <= 1)
656 for (i = 1;i < texture->mipmaps;i++)
658 for (z = 0;z < texture->mipmap[i][4];z++)
662 if (layer1 >= texture->mipmap[i-1][4])
663 layer1 = texture->mipmap[i-1][4]-1;
664 for (y = 0;y < texture->mipmap[i][3];y++)
668 if (row1 >= texture->mipmap[i-1][3])
669 row1 = texture->mipmap[i-1][3]-1;
670 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
671 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675 w = texture->mipmap[i][2];
678 if (texture->mipmap[i-1][2] > 1)
680 // average 3D texture
// eight source texels per output texel: +4 rounds, >>3 divides by 8
681 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
683 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691 // average 3D mipmap with parent width == 1
// four source texels per output texel; only i0 and i1 advance in this loop
692 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
694 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
703 if (texture->mipmap[i-1][2] > 1)
705 // average 2D texture (common case)
706 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716 // 2D texture with parent width == 1
717 o[0] = (i0[0] + i1[0] + 1) >> 1;
718 o[1] = (i0[1] + i1[1] + 1) >> 1;
719 o[2] = (i0[2] + i1[2] + 1) >> 1;
720 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte pixels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then regenerates
// the mip chain.
// NOTE(review): in the visible lines the destination is always computed from
// level 0 (mipmap[0][2]) and `mip` is not used, and the rectangle is not
// checked against the texture size - confirm against the elided lines.
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
729 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737 while (blockheight > 0)
739 memcpy(dst, pixels, blockwidth * 4);
// source rows are packed; destination rows are one full texture row apart
740 pixels += blockwidth * 4;
741 dst += texture->mipmap[0][2] * 4;
745 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of a texture with `pixels` (mipmap[0][1]
// bytes are copied) and regenerates the mip chain. Does nothing when `index`
// is not a live texture. A few lines before the memcpy are not visible here.
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
754 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755 DPSOFTRAST_Texture_CalculateMipmaps(index);
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761 return texture->mipmap[mip][2];
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
765 DPSOFTRAST_Texture *texture;
766 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767 return texture->mipmap[mip][3];
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
771 DPSOFTRAST_Texture *texture;
772 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 when `index` is not a live texture.
// Lines between the lookup and the return are not visible in this view.
// NOTE(review): `mip` is not range-checked in the visible lines.
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
777 DPSOFTRAST_Texture *texture;
778 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
781 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of a texture. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are rejected, with an error string, for
// textures created without DPSOFTRAST_TEXTURE_FLAG_MIPMAP.
// The return of the rejection path and lines before the assignment are not
// visible in this view.
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
785 DPSOFTRAST_Texture *texture;
786 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
789 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794 texture->filter = filter;
// Forward declaration; defined outside this view.
797 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes all commands written so far by copying commandpool.freecommand to
// dpsoftrast.drawcommand. When threads are in use a MEMORY_BARRIER is issued
// first, so the command bytes are stored before the new offset.
799 static void DPSOFTRAST_Draw_SyncCommands(void)
801 if(dpsoftrast.usethreads) MEMORY_BARRIER;
802 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for `space` bytes in the command pool by recomputing how much of
// the pool is still in use by the threads and, if that is not enough, waiting
// on a thread. The loop structure, the waitindex bookkeeping and the return
// statements are on lines not visible in this view.
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
807 DPSOFTRAST_State_Thread *thread;
809 int freecommand = dpsoftrast.commandpool.freecommand;
810 int usedcommands = dpsoftrast.commandpool.usedcommands;
// already enough free space
811 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
813 DPSOFTRAST_Draw_SyncCommands();
819 for (i = 0; i < dpsoftrast.numthreads; i++)
821 thread = &dpsoftrast.threads[i];
// distance from the thread's position to the write position, wrapped to the pool size
822 commandoffset = freecommand - thread->commandoffset;
823 if (commandoffset < 0)
824 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// track the largest distance over all threads
825 if (commandoffset > usedcommands)
828 usedcommands = commandoffset;
831 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread unless it has already reached drawcommand
833 thread = &dpsoftrast.threads[waitindex];
834 Thread_LockMutex(thread->drawmutex);
835 if (thread->commandoffset != dpsoftrast.drawcommand)
837 thread->waiting = true;
// signal the thread's draw condition first if it has flagged itself as starving
838 if (thread->starving) Thread_CondSignal(thread->drawcond);
839 Thread_CondWait(thread->waitcond, thread->drawmutex);
840 thread->waiting = false;
842 Thread_UnlockMutex(thread->drawmutex);
844 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE (the mask arithmetic
// requires COMMAND_SIZE to be a power of two).
847 #define DPSOFTRAST_ALIGNCOMMAND(size) \
848 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command record of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> and the aligned struct size.
849 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
850 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool, fills in the record header
// (opcode, commandsize) and updates the pool's freecommand / usedcommands.
// When the command does not fit before the end of the pool, a
// DPSOFTRAST_OPCODE_Reset record is written at the current position and the
// unused tail is counted as used. Several lines (the freecommand updates, the
// `else`, braces and the return) are not visible in this view.
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
854 DPSOFTRAST_Command *command;
855 int freecommand = dpsoftrast.commandpool.freecommand;
856 int usedcommands = dpsoftrast.commandpool.usedcommands;
// space needed beyond `size`: one command header, plus the pool tail when the
// command does not fit before the end
857 int extra = sizeof(DPSOFTRAST_Command);
858 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough room: reclaim space, then reload the pool counters
860 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
862 if (dpsoftrast.usethreads)
863 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
865 DPSOFTRAST_Draw_FlushThreads();
866 freecommand = dpsoftrast.commandpool.freecommand;
867 usedcommands = dpsoftrast.commandpool.usedcommands;
869 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
871 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872 command->opcode = DPSOFTRAST_OPCODE_Reset;
873 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
876 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877 command->opcode = opcode;
878 command->commandsize = size;
880 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
882 dpsoftrast.commandpool.freecommand = freecommand;
883 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated `size` bytes of the command pool by
// adjusting freecommand and usedcommands. The lines that subtract from
// freecommand and test for wrap-around are not visible in this view; the
// visible addition of the pool size is the wrap-around correction.
887 static void DPSOFTRAST_UndoCommand(int size)
889 int freecommand = dpsoftrast.commandpool.freecommand;
890 int usedcommands = dpsoftrast.commandpool.usedcommands;
893 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894 usedcommands -= size;
895 dpsoftrast.commandpool.freecommand = freecommand;
896 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): viewport rectangle x, y, width, height.
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Applies a Viewport command to a thread's state and marks the
// framebuffer-derived values stale.
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
902 thread->viewport[0] = command->x;
903 thread->viewport[1] = command->y;
904 thread->viewport[2] = command->width;
905 thread->viewport[3] = command->height;
906 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and also updates the global copy of the viewport
// and its transform (dpsoftrast.viewport, fb_viewportcenter, fb_viewportscale).
// The assignments of command->x / command->y are on lines not visible here.
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
910 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913 command->width = width;
914 command->height = height;
916 dpsoftrast.viewport[0] = x;
917 dpsoftrast.viewport[1] = y;
918 dpsoftrast.viewport[2] = width;
919 dpsoftrast.viewport[3] = height;
920 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): clear colour as four float channels.
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Fills the part of the scissor rectangle that lies in this thread's row bands
// with the packed clear colour, in every non-NULL colour buffer.
// Several declarations, early exits and the innermost store are on lines not
// visible in this view.
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
926 int i, x1, y1, x2, y2, w, h, x, y;
927 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
931 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932 miny1 = thread->miny1;
933 maxy1 = thread->maxy1;
934 miny2 = thread->miny2;
935 maxy2 = thread->maxy2;
936 x1 = thread->fb_scissor[0];
937 y1 = thread->fb_scissor[1];
938 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to this thread's bands
940 if (y1 < miny1) y1 = miny1;
941 if (y2 > maxy2) y2 = maxy2;
946 // FIXME: honor fb_colormask?
947 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948 for (i = 0;i < 4;i++)
950 if (!dpsoftrast.fb_colorpixels[i])
// first pass covers rows up to maxy1; the step then jumps y to at least miny2
// and raises the limit to maxy2 for the second band
952 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
955 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956 for (x = x1;x < x2;x++)
// Queues a ClearColor command; the lines that store r, g, b, a into the
// command are not visible in this view.
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
963 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): depth value to clear to.
970 DEFCOMMAND(3, ClearDepth, float depth;)
// Fills the part of the scissor rectangle that lies in this thread's row bands
// of the depth buffer with the converted depth value.
// Several declarations, early exits and the innermost store are on lines not
// visible in this view.
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
973 int x1, y1, x2, y2, w, h, x, y;
974 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are current
978 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979 miny1 = thread->miny1;
980 maxy1 = thread->maxy1;
981 miny2 = thread->miny2;
982 maxy2 = thread->maxy2;
983 x1 = thread->fb_scissor[0];
984 y1 = thread->fb_scissor[1];
985 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the row range to this thread's bands
987 if (y1 < miny1) y1 = miny1;
988 if (y2 > maxy2) y2 = maxy2;
993 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// same two-band row iteration as in DPSOFTRAST_Interpret_ClearColor
994 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
997 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998 for (x = x1;x < x2;x++)
// Queues a ClearDepth command; the line that stores d into the command is not
// visible in this view.
1002 void DPSOFTRAST_ClearDepth(float d)
1004 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): per-channel write enables.
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores each channel enable as 0 or 1 and builds fb_colormask, a 32-bit mask
// with 0xFF in the byte of every enabled channel (r: bits 16-23, g: bits 8-15,
// b: bits 0-7, a: bits 24-31, matching DPSOFTRAST_BGRA8_FROM_RGBA32F).
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1011 thread->colormask[0] = command->r != 0;
1012 thread->colormask[1] = command->g != 0;
1013 thread->colormask[2] = command->b != 0;
1014 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which selects the channel's byte
1015 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command; the lines that store r, g, b, a into the command
// are not visible in this view.
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1019 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enable flag for depth testing.
1026 DEFCOMMAND(5, DepthTest, int enable;)
// Stores the depth-test enable and marks the derived depth function stale.
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1029 thread->depthtest = command->enable;
1030 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying `enable`.
1032 void DPSOFTRAST_DepthTest(int enable)
1034 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035 command->enable = enable;
// ScissorTest command (opcode 6): enable flag for the scissor test.
1038 DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the scissor-test enable and marks the framebuffer-derived values stale.
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1041 thread->scissortest = command->enable;
1042 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying `enable`.
1044 void DPSOFTRAST_ScissorTest(int enable)
1046 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047 command->enable = enable;
// Scissor command (opcode 7): scissor rectangle x, y, width, height as floats.
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Stores the scissor rectangle and marks the framebuffer-derived values stale.
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1053 thread->scissor[0] = command->x;
1054 thread->scissor[1] = command->y;
1055 thread->scissor[2] = command->width;
1056 thread->scissor[3] = command->height;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command; the assignments of command->x / command->y are on
// lines not visible in this view.
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1061 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1064 command->width = width;
1065 command->height = height;
// BlendFunc command (opcode 8): source and destination blend factors.
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores the blend factors and marks the derived blend mode stale.
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1071 thread->blendfunc[0] = command->sfactor;
1072 thread->blendfunc[1] = command->dfactor;
1073 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendFunc command and records both factors.
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1077 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078 command->sfactor = sfactor;
1079 command->dfactor = dfactor;
// BlendSubtract command: number 9, payload is a single enable flag.
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
// Worker-thread handler: stores the flag and sets DPSOFTRAST_VALIDATE_BLENDFUNC,
// the same validate bit the BlendFunc handler sets.
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1085 thread->blendsubtract = command->enable;
1086 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Public entry point: allocates a BlendSubtract command and records the enable flag.
1088 void DPSOFTRAST_BlendSubtract(int enable)
1090 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091 command->enable = enable;
// DepthMask command: number 10, payload is a single enable flag.
1094 DEFCOMMAND(10, DepthMask, int enable;)
// Worker-thread handler: stores the flag in thread->depthmask.
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1097 thread->depthmask = command->enable;
// Public entry point: allocates a DepthMask command and records the enable flag.
1099 void DPSOFTRAST_DepthMask(int enable)
1101 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102 command->enable = enable;
// DepthFunc command: number 11, payload is the comparison function selector.
1105 DEFCOMMAND(11, DepthFunc, int func;)
// Worker-thread handler: stores the selector in thread->depthfunc.
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1108 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records the selector.
1110 void DPSOFTRAST_DepthFunc(int func)
1112 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113 command->func = func;
// DepthRange command: number 12, payload is the near and far values.
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Worker-thread handler: stores near in thread->depthrange[0] and far in [1].
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1119 thread->depthrange[0] = command->nearval;
1120 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records both values.
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1124 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125 command->nearval = nearval;
1126 command->farval = farval;
// PolygonOffset command: number 13, payload is the two offset terms.
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Worker-thread handler: stores alongnormal in thread->polygonoffset[0] and
// intoview in [1].
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1132 thread->polygonoffset[0] = command->alongnormal;
1133 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both terms.
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1137 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138 command->alongnormal = alongnormal;
1139 command->intoview = intoview;
// CullFace command: number 14, payload is the culling mode.
1142 DEFCOMMAND(14, CullFace, int mode;)
// Worker-thread handler: stores the mode in thread->cullface.
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1145 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the mode.
1147 void DPSOFTRAST_CullFace(int mode)
1149 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150 command->mode = mode;
// AlphaTest command: number 15, payload is a single enable flag.
1153 DEFCOMMAND(15, AlphaTest, int enable;)
// Worker-thread handler: stores the flag in thread->alphatest.
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1156 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records the enable flag.
1158 void DPSOFTRAST_AlphaTest(int enable)
1160 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161 command->enable = enable;
// AlphaFunc command: number 16, payload is the comparison selector and the
// reference value.
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Worker-thread handler: stores the selector in thread->alphafunc and the
// reference value in thread->alphavalue.
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1167 thread->alphafunc = command->func;
1168 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records the selector.
// NOTE(review): the assignment of ref into the command is not visible here -
// confirm it is present.
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1172 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173 command->func = func;
// Sets the current constant color. Unlike the command-based setters around it,
// this writes dpsoftrast.color directly and allocates no command.
// DPSOFTRAST_Array_Load uses this color to fill the color array when no color
// pointer is set.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads a rectangle of 4-byte pixels from color buffer 0 into outpixels.
//   blockx, blocky, blockwidth, blockheight: requested rectangle
//   outpixels: destination, addressed with a row stride of blockwidth*4 bytes
// The rectangle is clipped to the framebuffer, and source rows are addressed as
// (fb_height - 1 - y), so the image is flipped vertically on the way out.
// NOTE(review): the output row offset is (y - by1) * outstride with no column
// term. If the origin was clipped from a negative blockx/blocky, the data looks
// like it would land at the output's top-left rather than at the matching
// offset - confirm against the lines not visible here (bx1/by1 setup).
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clip the rectangle to the framebuffer bounds
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts walk each row pixel by pixel
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
// other hosts: same row addressing, no per-pixel loop visible
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle from color buffer 0 into one mip level of a texture.
//   index: texture handle resolved through DPSOFTRAST_Texture_GetByIndex
//   mip: destination mip level; out-of-range values return early
//   tx, ty: destination origin in the texture
//   sx, sy: source origin in the framebuffer
//   width, height: requested size
// Both rectangles are clipped to their surfaces and the copy size is limited to
// the smaller of the two. Source rows are read bottom-up (vertical flip).
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source surface: color buffer 0 at the framebuffer size
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// destination surface: mipmap[mip][0] is used as a byte offset into
// texture->bytes, [2] and [3] as that level's width and height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clip destination and source rectangles to their surfaces
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
// empty copy region (the guarded statement is not visible here; presumably a return)
1279 if (tw < 1 || th < 1)
// convert sy1 to a bottom-up row index, then walk source rows downward in memory
// (sy1 - y) while destination rows go upward (ty1 + y)
1281 sy1 = sheight - 1 - sy1;
1282 for (y = 0;y < th;y++)
1283 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// regenerate the mip chain when the texture has more than one level
1284 if (texture->mipmaps > 1)
1285 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command: number 17, payload is the texture unit and the texture
// pointer to bind (may be NULL).
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Worker-thread handler: if the unit already has a texture bound, atomically
// decrements that texture's binds counter, then stores the new binding.
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 if (thread->texbound[command->unitnum])
1292 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293 thread->texbound[command->unitnum] = command->texture;
// Public entry point: binds the texture with handle `index` to unit `unitnum`.
// Sets dpsoftrast.errorstring for an out-of-range unit or for a non-zero handle
// that does not resolve to a texture. Otherwise it queues a SetTexture command,
// updates the main-thread binding table and adds dpsoftrast.numthreads to the
// texture's binds counter.
// NOTE(review): the invalid-handle check only fires for a non-zero index, so
// `texture` can still be NULL at the ATOMIC_ADD if GetByIndex(0) returns NULL.
// No NULL guard is visible before `texture->binds` - confirm one exists or add it.
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 DPSOFTRAST_Command_SetTexture *command;
1298 DPSOFTRAST_Texture *texture;
1299 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1304 texture = DPSOFTRAST_Texture_GetByIndex(index);
1305 if (index && !texture)
1307 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1311 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312 command->unitnum = unitnum;
1313 command->texture = texture;
// mirror the binding on the submitting side and account for the pending binds
1315 dpsoftrast.texbound[unitnum] = texture;
1316 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array pointer and its byte stride in the global state.
// DPSOFTRAST_Array_Load later reads positions through these.
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1321 dpsoftrast.pointer_vertex3f = vertex3f;
1322 dpsoftrast.stride_vertex = stride;
// Selects a float color array: records the pointer and byte stride, and clears
// the byte color pointer so only one color source is active.
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1326 dpsoftrast.pointer_color4f = color4f;
1327 dpsoftrast.pointer_color4ub = NULL;
1328 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte color array: records the pointer and byte stride, and
// clears the float color pointer so only one color source is active.
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1332 dpsoftrast.pointer_color4f = NULL;
1333 dpsoftrast.pointer_color4ub = color4ub;
1334 dpsoftrast.stride_color = stride;
// Records the texcoord array for one unit: pointer, component count and byte stride.
// NOTE(review): unitnum indexes three arrays with no range check visible here -
// confirm callers always pass a valid unit.
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1338 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command: number 18, payload is the mode, the permutation and the
// exactspecularmath flag.
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Worker-thread handler: copies the three shader selection fields into the
// thread state.
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1346 thread->shader_mode = command->mode;
1347 thread->shader_permutation = command->permutation;
1348 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command and also mirrors the same three
// values into the global dpsoftrast state.
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1352 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353 command->mode = mode;
1354 command->permutation = permutation;
1355 command->exactspecularmath = exactspecularmath;
// submitting-side copy of the shader selection
1357 dpsoftrast.shader_mode = mode;
1358 dpsoftrast.shader_permutation = permutation;
1359 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command: number 19, payload is the uniform index and four floats.
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Worker-thread handler: copies the four floats into thread->uniform4f starting
// at element index*4 (each uniform occupies four consecutive floats).
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1365 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: queues a Uniform4f command holding (v0, v1, v2, v3) for
// `index` and mirrors the same values into dpsoftrast.uniform4f at index*4.
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1369 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370 command->index = index;
1371 command->val[0] = v0;
1372 command->val[1] = v1;
1373 command->val[2] = v2;
1374 command->val[3] = v3;
// submitting-side copy of the uniform
1376 dpsoftrast.uniform4f[index*4+0] = v0;
1377 dpsoftrast.uniform4f[index*4+1] = v1;
1378 dpsoftrast.uniform4f[index*4+2] = v2;
1379 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: reads four floats from v, queues them in a
// Uniform4f command and mirrors them into dpsoftrast.uniform4f at index*4.
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1383 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384 command->index = index;
1385 memcpy(command->val, v, sizeof(command->val));
1387 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command: number 20, payload is the uniform index and sixteen
// floats declared through ALIGN().
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Worker-thread handler: copies all sixteen floats into thread->uniform4f
// starting at element index*4, so the matrix spans four consecutive uniform slots.
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1393 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Public entry point: uploads `arraysize` 4x4 float matrices starting at `uniform`.
// Each matrix gets its own UniformMatrix4f command and is mirrored into
// dpsoftrast.uniform4f.
//   uniform: first uniform slot; each matrix advances the slot index by 4
//   arraysize: number of matrices read from v (16 floats each)
//   transpose: NOTE(review) presumably gates the transpose step below; the
//              condition itself is not visible here - confirm
//   v: source floats; may be unaligned (handled with unaligned loads)
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1399 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1401 __m128 m0, m1, m2, m3;
1402 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the address of v
1404 if (((size_t)v)&(ALIGN_SIZE-1))
1406 m0 = _mm_loadu_ps(v);
1407 m1 = _mm_loadu_ps(v+4);
1408 m2 = _mm_loadu_ps(v+8);
1409 m3 = _mm_loadu_ps(v+12);
1413 m0 = _mm_load_ps(v);
1414 m1 = _mm_load_ps(v+4);
1415 m2 = _mm_load_ps(v+8);
1416 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine halves so that new m0
// holds lane 0 of every original row, m1 lane 1, and so on
1420 __m128 t0, t1, t2, t3;
1421 t0 = _mm_unpacklo_ps(m0, m1);
1422 t1 = _mm_unpacklo_ps(m2, m3);
1423 t2 = _mm_unpackhi_ps(m0, m1);
1424 t3 = _mm_unpackhi_ps(m2, m3);
1425 m0 = _mm_movelh_ps(t0, t1);
1426 m1 = _mm_movehl_ps(t1, t0);
1427 m2 = _mm_movelh_ps(t2, t3);
1428 m3 = _mm_movehl_ps(t3, t2);
// aligned stores: both destinations must be 16-byte aligned for _mm_store_ps
1430 _mm_store_ps(command->val, m0);
1431 _mm_store_ps(command->val+4, m1);
1432 _mm_store_ps(command->val+8, m2);
1433 _mm_store_ps(command->val+12, m3);
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command: number 21, payload is the uniform index and one integer.
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Worker-thread handler: stores the integer in thread->uniform1i[index].
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1445 thread->uniform1i[command->index] = command->val;
// Public entry point: queues a Uniform1i command for `index` and mirrors i0 into
// dpsoftrast.uniform1i.
// NOTE(review): the assignment of i0 into the command is not visible here -
// confirm it is present.
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1449 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450 command->index = index;
1453 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command: number 24, payload is four plane coefficients.
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Worker-thread handler: copies the four coefficients into thread->clipplane and
// sets DPSOFTRAST_VALIDATE_FB in thread->validate.
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1459 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point: queues a ClipPlane command with coefficients (x, y, z, w).
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1464 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465 command->clipplane[0] = x;
1466 command->clipplane[1] = y;
1467 command->clipplane[2] = z;
1468 command->clipplane[3] = w;
// Copies `size` four-float elements from a strided byte source into dst.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src, stride: source base and byte stride between elements
// The loop bodies that advance dst/src are not visible here.
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1474 float *end = dst + size*4;
// a single test covers both base address and stride: if either has low bits set,
// some element would be misaligned, so the unaligned-load path is used
1475 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1479 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1488 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` three-float elements into four-float elements in dst, with the
// fourth component set to 1.0.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src, stride: source base and byte stride between elements
// Two strategies are visible. Tightly packed input (stride == 12 bytes) is
// handled four elements at a time from three 16-byte loads. Any other stride
// goes one element per 16-byte load.
// The recurring shuffle / move_ss / shuffle triple rotates the vector so the
// lane to be replaced is lane 0, overwrites it with 1.0, then rotates back.
// NOTE(review): the per-element path loads 16 bytes from a 12-byte element. For
// the last element this looks like it reads 4 bytes past the source data unless
// the lines not visible here handle the tail separately - confirm.
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1497 float *end = dst + size*4;
1498 if (stride == sizeof(float[3]))
// end of the portion that can be processed in groups of four elements
1500 float *end4 = dst + (size&~3)*4;
1501 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// unaligned source: v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1505 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1506 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1512 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1519 src += 4*sizeof(float[3]);
// aligned source: identical shuffles, aligned loads
1526 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1540 src += 4*sizeof(float[3]);
// per-element path: one element per iteration, fourth lane replaced by 1.0
1544 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1548 __m128 v = _mm_loadu_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565 _mm_store_ps(dst, v);
// Expands `size` two-float elements into four-float elements in dst, filling the
// last two components with (0, 1).
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src, stride: source base and byte stride between elements
// Tightly packed input (stride == 8 bytes) is processed two elements per 16-byte
// load. Other strides load 8 bytes per element.
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 float *end = dst + size*4;
// filler vector: its upper two lanes (0, 1) supply the third and fourth components
1575 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1576 if (stride == sizeof(float[2]))
// end of the portion that can be processed in pairs
1578 float *end2 = dst + (size&~1)*4;
1579 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v holds two elements; low half + filler gives the first, high half + filler the second
1583 __m128 v = _mm_loadu_ps((const float *)src);
1584 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1585 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1587 src += 2*sizeof(float[2]);
1594 __m128 v = _mm_load_ps((const float *)src);
1595 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1598 src += 2*sizeof(float[2]);
// single element: low two lanes from memory, high two lanes from the filler
1604 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` four-byte elements into four-float elements in dst, scaling
// each byte by 1/255.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src, stride: source base and byte stride between elements
// Tightly packed input (stride == 4 bytes) is processed four elements per
// 16-byte load. Other strides convert one element at a time.
// Bytes are widened to 32-bit integers by unpacking against zero twice, then
// converted to float.
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1612 float *end = dst + size*4;
1613 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1614 if (stride == sizeof(unsigned char[4]))
// end of the portion that can be processed in groups of four elements
1616 float *end4 = dst + (size&~3)*4;
1617 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1/v2 hold the low/high eight bytes widened to 16 bits
1621 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
1634 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640 src += 4*sizeof(unsigned char[4]);
// single element: the four bytes are read through an int pointer
1646 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills dst with copies of the single four-float value at src.
//   dst: written with aligned stores, so it must be 16-byte aligned
//   src: read once with an unaligned load
//   size: element count; `end` marks dst + 4*size (the loop itself is not
//         visible here)
1653 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1655 float *end = dst + 4*size;
1656 __m128 v = _mm_loadu_ps(src);
1659 _mm_store_ps(dst, v);
// Transforms `numitems` four-float vectors by a 4x4 matrix.
//   out4f, in4f: destination and source; may be the same buffer
//   inmatrix16f: sixteen floats, read with unaligned loads as four rows m0..m3
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where x..w are the input lanes
// broadcast across the vector.
1665 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1668 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1669 __m128 m0, m1, m2, m3;
// bitwise comparison against the identity matrix selects the copy-only path
1671 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1673 // fast case for identity matrix
1674 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1677 end = out4f + numitems*4;
1678 m0 = _mm_loadu_ps(inmatrix16f);
1679 m1 = _mm_loadu_ps(inmatrix16f + 4);
1680 m2 = _mm_loadu_ps(inmatrix16f + 8);
1681 m3 = _mm_loadu_ps(inmatrix16f + 12);
// the two variants below differ only in how the input vector is loaded
1682 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1686 __m128 v = _mm_loadu_ps(in4f);
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1691 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1700 __m128 v = _mm_load_ps(in4f);
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1705 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` four-float vectors from in4f to out4f with memcpy. The
// buffers must not overlap.
1713 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1715 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Perspective-divides a clip-space vector and applies the viewport transform.
// The input is rotated so lane 0 holds a constant 1.0 and lanes 1..3 hold the
// original lanes 0..2. It is then scaled, divided by w, offset by
// viewportcenter, and rotated back.
// After the rotate-back, output lanes 0..2 use lanes 1..3 of viewportcenter and
// viewportscale, and output lane 3 is viewportcenter[0] + viewportscale[0] / w.
// `in` is evaluated once; `out` may alias `in`.
1719 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1721 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1722 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1723 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1724 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: rotate, put 1.0 in lane 0,
// scale, divide by w, offset by viewportcenter, rotate back.
// NOTE(review): no use of this macro is visible here; confirm whether it is
// still needed alongside DPSOFTRAST_PROJECTVERTEX.
1727 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1729 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1730 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1731 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1732 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies a four-float vector by a matrix given as four row vectors:
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3.
// `p` is a temporary declared on a line not visible here (presumably
// initialised from `in`).
1735 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1738 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1739 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1740 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1741 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the screen-space Y range covered by a bounding box after transform
// and projection.
//   starty, endy: outputs; truncated integer bounds of the projected range
//   minposf, maxposf: box corners, read with aligned loads (must be 16-byte aligned)
//   inmatrix16f: transform applied to each of the eight corners
// Corners with z + w >= 0 are projected directly. Corners failing that test are
// replaced by the points where their box edges cross z + w == 0.
// NOTE(review): the return statement is not visible here; callers store the
// result in dpsoftrast.drawclipped, so it is presumably the final clipmask -
// confirm.
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per corner; a set bit means the corner has not passed the z + w test
1746 int clipmask = 0xFF;
1747 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start outside the [-2, 2] clamp range applied further down
1748 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix row so the transformed Y ends up in lane 0,
// where the scalar (_ss) operations below can reach it
1751 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its distance z + w, and if it is
// non-negative clear the corner's clip bit and fold its projected Y (lane 0 / w)
// into minproj/maxproj
1755 #define BBFRONT(k, pos) \
1757 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1762 clipmask &= ~(1<<k); \
1763 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764 minproj = _mm_min_ss(minproj, proj); \
1765 maxproj = _mm_max_ss(maxproj, proj); \
// corners are built by mixing lanes of minpos and maxpos (corners 0 and 7 are not
// visible here)
1769 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1770 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1771 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1772 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1773 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(6, _mm_move_ss(maxpos, minpos));
// BBCLIP(k): for a corner still flagged in clipmask, look at its three edge
// neighbours (k^1, k^2, k^4). For each unflagged neighbour, interpolate along the
// edge to the point where z + w reaches zero, project it and fold it into
// minproj/maxproj
1778 if (clipmask&(1<<k)) \
1780 if (!(clipmask&(1<<(k^1)))) \
1782 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785 minproj = _mm_min_ss(minproj, proj); \
1786 maxproj = _mm_max_ss(maxproj, proj); \
1788 if (!(clipmask&(1<<(k^2)))) \
1790 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793 minproj = _mm_min_ss(minproj, proj); \
1794 maxproj = _mm_max_ss(maxproj, proj); \
1796 if (!(clipmask&(1<<(k^4)))) \
1798 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801 minproj = _mm_min_ss(minproj, proj); \
1802 maxproj = _mm_max_ss(maxproj, proj); \
1806 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move original lane 2 of the viewport vectors into lane 0 for the scalar math below
1807 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before converting to pixels
1809 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (+1 after truncation), i.e. the
// viewport mapping is treated as inverting the Y order
1813 *starty = _mm_cvttss_si32(maxproj);
1814 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed vertices to screen space and computes
// their Y bounds.
//   out4f: receives an unmodified copy of each input vertex (aligned stores)
//   screen4f: receives each projected vertex (aligned stores)
//   starty, endy: outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f: source vertices; aligned and unaligned inputs use separate loops
// Returns whatever DPSOFTRAST_Vertex_BoundY returns for the input bounding box
// under the identity matrix.
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821 float *end = out4f + numitems*4;
1822 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823 __m128 minpos, maxpos;
1824 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1826 minpos = maxpos = _mm_loadu_ps(in4f);
1829 __m128 v = _mm_loadu_ps(in4f);
1830 minpos = _mm_min_ps(minpos, v);
1831 maxpos = _mm_max_ps(maxpos, v);
1832 _mm_store_ps(out4f, v);
1833 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834 _mm_store_ps(screen4f, v);
// aligned input: same work with aligned loads
1842 minpos = maxpos = _mm_load_ps(in4f);
1845 __m128 v = _mm_load_ps(in4f);
1846 minpos = _mm_min_ps(minpos, v);
1847 maxpos = _mm_max_ps(maxpos, v);
1848 _mm_store_ps(out4f, v);
1849 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because BoundY takes float pointers
1858 ALIGN(float minposf[4]);
1859 ALIGN(float maxposf[4]);
1860 _mm_store_ps(minposf, minpos);
1861 _mm_store_ps(maxposf, maxpos);
1862 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` vertices by inmatrix16f, projects them to screen space
// and computes their Y bounds.
//   out4f: receives the transformed vertices (aligned stores)
//   screen4f: receives the projected vertices (aligned stores)
//   starty, endy: outputs filled by DPSOFTRAST_Vertex_BoundY
//   in4f: source vertices; aligned and unaligned inputs use separate loops
//   inmatrix16f: sixteen floats, read with unaligned loads
// The bounding box is accumulated from the untransformed inputs and passed to
// BoundY together with the matrix. An identity matrix (bitwise compare)
// delegates to DPSOFTRAST_Vertex_Project.
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1872 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874 end = out4f + numitems*4;
1875 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877 m0 = _mm_loadu_ps(inmatrix16f);
1878 m1 = _mm_loadu_ps(inmatrix16f + 4);
1879 m2 = _mm_loadu_ps(inmatrix16f + 8);
1880 m3 = _mm_loadu_ps(inmatrix16f + 12);
1881 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
1883 minpos = maxpos = _mm_loadu_ps(in4f);
1886 __m128 v = _mm_loadu_ps(in4f);
1887 minpos = _mm_min_ps(minpos, v);
1888 maxpos = _mm_max_ps(maxpos, v);
1889 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890 _mm_store_ps(out4f, v);
1891 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892 _mm_store_ps(screen4f, v);
// aligned input: same work with aligned loads
1900 minpos = maxpos = _mm_load_ps(in4f);
1903 __m128 v = _mm_load_ps(in4f);
1904 minpos = _mm_min_ps(minpos, v);
1905 maxpos = _mm_max_ps(maxpos, v);
1906 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907 _mm_store_ps(out4f, v);
1908 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909 _mm_store_ps(screen4f, v);
// spill the bounds to aligned temporaries because BoundY takes float pointers
1917 ALIGN(float minposf[4]);
1918 ALIGN(float maxposf[4]);
1919 _mm_store_ps(minposf, minpos);
1920 _mm_store_ps(maxposf, maxpos);
1921 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex attribute into the four-float working array
// dpsoftrast.post_array4f[outarray]. It covers dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex.
//   outarray: index of the destination working array
//   inarray: attribute selector (position, color, or a texcoord unit given as
//            DPSOFTRAST_ARRAY_TEXCOORD0 + unit)
// NOTE(review): the return statement is not visible here; callers use the result
// as the data pointer, so it is presumably outf - confirm.
1927 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1930 float *outf = dpsoftrast.post_array4f[outarray];
1931 const unsigned char *inb;
1932 int firstvertex = dpsoftrast.firstvertex;
1933 int numvertices = dpsoftrast.numvertices;
// positions: three floats per vertex, expanded to four with w = 1
1937 case DPSOFTRAST_ARRAY_POSITION:
1938 stride = dpsoftrast.stride_vertex;
1939 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1940 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else byte array if set, else the constant color
1942 case DPSOFTRAST_ARRAY_COLOR:
1943 stride = dpsoftrast.stride_color;
1944 if (dpsoftrast.pointer_color4f)
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1947 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1949 else if (dpsoftrast.pointer_color4ub)
1951 stride = dpsoftrast.stride_color;
1952 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1953 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1957 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the unit is inarray - DPSOFTRAST_ARRAY_TEXCOORD0; the loader is
// chosen from the recorded component count (the case labels are not visible here)
1961 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1962 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1964 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1965 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1971 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1974 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working array in place by inmatrix16f. If inarray >= 0 the
// attribute is first loaded into post_array4f[outarray]; a negative inarray
// reuses whatever that working array already holds.
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working array to screen space. It fills dpsoftrast.screencoord4f,
// drawstarty and drawendy, and stores the result of DPSOFTRAST_Vertex_Project in
// dpsoftrast.drawclipped. A negative inarray reuses the existing contents of
// post_array4f[outarray].
1994 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1997 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1998 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working array in place by inmatrix16f and projects it to screen
// space. It fills dpsoftrast.screencoord4f, drawstarty and drawendy, and stores
// the result of DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped. A
// negative inarray reuses the existing contents of post_array4f[outarray].
2006 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2009 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2010 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Prepares per-pixel reciprocal-w values for a span.
//   triangle->w[0], w[1], w[2]: used here as the x slope, the y slope and the
//                               constant term of w
//   span->x, span->y, startx, endx: span origin and pixel range
//   zf: NOTE(review) presumably receives one value per pixel; the stores are not
//       visible here - confirm
// When the x slope is zero a flat loop is used. Otherwise the span is cut into
// sub-spans of DPSOFTRAST_DRAW_MAXSUBSPAN pixels: 1/w is evaluated exactly at
// sub-span ends and stepped linearly (dz) in between.
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2020 int startx = span->startx;
2021 int endx = span->endx;
2022 float wslope = triangle->w[0];
// w at the span origin; the per-pixel x term is added below
2023 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2024 float endz = 1.0f / (w + wslope * startx);
2025 if (triangle->w[0] == 0)
2027 // LordHavoc: fast flat polygons (HUD/menu)
2028 for (x = startx;x < endx;x++)
2032 for (x = startx;x < endx;)
2034 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// last sub-span: stop exactly on the final pixel
2036 if (nextsub >= endx) nextsub = endsub = endx-1;
2037 endz = 1.0f / (w + wslope * nextsub);
// the x < nextsub guard avoids dividing by zero on a one-pixel sub-span
2038 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039 for (; x <= endsub; x++, z += dz)
// Finishes a span whose colors are supplied as 4 bytes per pixel in in4ub:
// first updates span->pixelmask from the alpha byte (index 3 of each pixel),
// then locates runs of unmasked pixels and copies or blends them into the
// color buffer dpsoftrast.fb_colorpixels[0] according to thread->fb_blendmode.
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2048 int startx = span->startx;
2049 int endx = span->endx;
// 32-bit view of the input so a whole pixel can be moved at once
2051 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte view and 32-bit view of the same destination color buffer
2053 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the span origin so they can be indexed with the span-local x
2057 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058 pixeli += span->y * dpsoftrast.fb_width + span->x;
2059 // handle alphatest now (this affects depth writes too)
2060 if (thread->alphatest)
2061 for (x = startx;x < endx;x++)
2062 if (in4ub[x*4+3] < 128)
2063 pixelmask[x] = false;
2064 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065 // helps sprites, text and hud artwork
2066 switch(thread->fb_blendmode)
// the alpha-weighted modes drop pixels whose alpha byte is 0
2068 case DPSOFTRAST_BLENDMODE_ALPHA:
2069 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071 for (x = startx;x < endx;x++)
2072 if (in4ub[x*4+3] < 1)
2073 pixelmask[x] = false;
// the remaining modes perform no extra masking in the lines visible here
2075 case DPSOFTRAST_BLENDMODE_OPAQUE:
2076 case DPSOFTRAST_BLENDMODE_ADD:
2077 case DPSOFTRAST_BLENDMODE_INVMOD:
2078 case DPSOFTRAST_BLENDMODE_MUL:
2079 case DPSOFTRAST_BLENDMODE_MUL2:
2080 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081 case DPSOFTRAST_BLENDMODE_INVADD:
2084 // put some special values at the end of the mask to ensure the loops end
// (pixelmask must therefore be writable at indices endx and endx+1)
2085 pixelmask[endx] = 1;
2086 pixelmask[endx+1] = 0;
2087 // LordHavoc: use a double loop to identify subspans, this helps the
2088 // optimized copy/blend loops to perform at their best, most triangles
2089 // have only one run of pixels, and do the search using wide reads...
2093 // if this pixel is masked off, it's probably not alone...
2100 // the 4-item search must be aligned or else it stalls badly
2101 if ((x & 3) && !pixelmask[x]) x++;
2102 if ((x & 3) && !pixelmask[x]) x++;
2103 if ((x & 3) && !pixelmask[x]) x++;
// NOTE(review): the cast applies before "+ x", so this reads the x-th 32-bit
// word of the mask (byte offset 4*x), while x is used as a byte index into
// pixelmask everywhere else in this function - confirm this is intended.
2104 while (*((unsigned int *)pixelmask + x) == 0x00000000)
2108 for (;!pixelmask[x];x++)
2110 // rather than continue the loop, just check the end variable
2114 // find length of subspan
2119 if ((subx & 3) && pixelmask[subx]) subx++;
2120 if ((subx & 3) && pixelmask[subx]) subx++;
2121 if ((subx & 3) && pixelmask[subx]) subx++;
// NOTE(review): same word-versus-byte indexing question as the search for masked-off pixels above
2122 while (*((unsigned int *)pixelmask + subx) == 0x01010101)
2126 for (;pixelmask[subx];subx++)
2128 // the checks can overshoot, so make sure to clip it...
2131 // now that we know the subspan length... process!
2132 switch(thread->fb_blendmode)
// opaque: plain copy of the run [x, subx), either with memcpy or with unaligned 128-bit moves
2134 case DPSOFTRAST_BLENDMODE_OPAQUE:
2138 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// 16 pixels (four 128-bit moves) per iteration
2143 while (x + 16 <= subx)
2145 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2146 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2147 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2148 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2153 while (x + 4 <= subx)
2155 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2161 pixeli[x+1] = ini[x+1];
// alpha blend: dst + (src - dst) * srcalpha / 256; both factors are shifted
// left by 4 so that the high 16 bits of the product carry the /256 result.
// FINISHBLEND widens src and dst bytes to 16-bit lanes, applies blend2 to
// two pixels at a time and blend1 to a single pixel, then packs the result
// back to bytes with unsigned saturation.
2171 case DPSOFTRAST_BLENDMODE_ALPHA:
2172 #define FINISHBLEND(blend2, blend1) \
2173 for (;x + 1 < subx;x += 2) \
2176 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2177 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2179 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2184 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2185 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2187 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2191 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2192 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2194 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2195 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// additive with alpha: dst + (src * srcalpha >> 8)
2198 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2200 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2201 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2203 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2204 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// additive: src + dst (saturated when packed back to bytes)
2207 case DPSOFTRAST_BLENDMODE_ADD:
2208 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// inverse modulate: dst - (dst * src >> 8)
2210 case DPSOFTRAST_BLENDMODE_INVMOD:
2212 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2214 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// modulate: src * dst >> 8
2217 case DPSOFTRAST_BLENDMODE_MUL:
2218 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// modulate with a shift of 7 instead of 8, i.e. twice the MUL result
2220 case DPSOFTRAST_BLENDMODE_MUL2:
2221 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// subtractive with alpha: dst - (src * srcalpha >> 8)
2223 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2225 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2228 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// src + dst - (dst * srcalpha >> 8): src is added unscaled, dst is attenuated by the source alpha
2232 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// dst + (255 - dst) * src / 256, using the same shift-by-4 and high-multiply form as the alpha blend
2241 case DPSOFTRAST_BLENDMODE_INVADD:
2243 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2245 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex along one span and writes four
// floats per pixel to out4f. Texture coordinates come from the interpolated
// attribute arrayindex multiplied by zf[x]; they are evaluated exactly at
// subspan endpoints and stepped in 20.12 fixed point in between. Texel bytes
// 2,1,0,3 map to output components 0,1,2,3, scaled to the 0..1 range.
2253 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2256 int startx = span->startx;
2257 int endx = span->endx;
2262 float tc[2], endtc[2];
// integer texel coordinates of the sample (tci) and of its +1 neighbour (tci1)
2264 unsigned int tci[2];
2265 unsigned int tci1[2];
// clamp limits used by the clamp-to-edge paths
2266 unsigned int tcimin[2];
2267 unsigned int tcimax[2];
2272 const unsigned char * RESTRICT pixelbase;
// the up to four texels that feed one bilinear sample
2273 const unsigned char * RESTRICT pixel[4];
2274 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2275 // if no texture is bound, just fill it with white
2278 for (x = startx;x < endx;x++)
2280 out4f[x*4+0] = 1.0f;
2281 out4f[x*4+1] = 1.0f;
2282 out4f[x*4+2] = 1.0f;
2283 out4f[x*4+3] = 1.0f;
2287 mip = triangle->mip[texunitindex];
// texture->mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2288 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2289 // if this mipmap of the texture is 1 pixel, just fill it with that color
2290 if (texture->mipmap[mip][1] == 4)
// NOTE(review): this reads from the start of texture->bytes rather than from
// pixelbase (the selected mip level), unlike the BGRA8 variant of this
// function which reads the single texel through pixelbase - confirm.
2292 c[0] = texture->bytes[2] * (1.0f/255.0f);
2293 c[1] = texture->bytes[1] * (1.0f/255.0f);
2294 c[2] = texture->bytes[0] * (1.0f/255.0f);
2295 c[3] = texture->bytes[3] * (1.0f/255.0f);
2296 for (x = startx;x < endx;x++)
2298 out4f[x*4+0] = c[0];
2299 out4f[x*4+1] = c[1];
2300 out4f[x*4+2] = c[2];
2301 out4f[x*4+3] = c[3];
// non-zero when the texture requests linear filtering
2305 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2306 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2307 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the x and y sizes of this mip level in texels
2308 tcscale[0] = texture->mipmap[mip][2];
2309 tcscale[1] = texture->mipmap[mip][3];
// row pitch in texels
2310 tciwidth = texture->mipmap[mip][2];
2313 tcimax[0] = texture->mipmap[mip][2]-1;
2314 tcimax[1] = texture->mipmap[mip][3]-1;
// size-1 used as an AND mask for wrapping (only a correct modulo when the sizes are powers of two)
2315 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2316 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// exact texel-space coordinate at the first pixel of the span
2317 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2318 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2324 for (x = startx;x < endx;)
// 20.12 fixed-point coordinate and per-pixel step for the current subspan
2326 unsigned int subtc[2];
2327 unsigned int substep[2];
// converts a coordinate delta across the subspan into a per-pixel step scaled by 4096
2328 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2329 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2330 if (nextsub >= endx)
// final subspan: end exactly on the last pixel and rescale the step to its actual length
2332 nextsub = endsub = endx-1;
2333 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
// exact texel-space coordinate at the far end of the subspan
2337 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2338 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2344 substep[0] = (endtc[0] - tc[0]) * subscale;
2345 substep[1] = (endtc[1] - tc[1]) * subscale;
2346 subtc[0] = tc[0] * (1<<12);
2347 subtc[1] = tc[1] * (1<<12);
// bilinear sampling with coordinates clamped to [tcimin, tcimax]
2350 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2352 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// low 12 bits are the fractional position between texels
2354 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2355 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// weights of the four neighbouring texels; they sum to 0x1000000
2356 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2357 tci[0] = subtc[0]>>12;
2358 tci[1] = subtc[1]>>12;
2359 tci1[0] = tci[0] + 1;
2360 tci1[1] = tci[1] + 1;
2361 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2362 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2363 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2364 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2365 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2366 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2367 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2368 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// 0xFF000000 = 255 * 0x1000000 normalizes the weighted byte sum to 0..1
2369 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2370 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2371 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2372 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2373 out4f[x*4+0] = c[0];
2374 out4f[x*4+1] = c[1];
2375 out4f[x*4+2] = c[2];
2376 out4f[x*4+3] = c[3];
// bilinear sampling with coordinates wrapped through tciwrapmask
2381 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2383 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2384 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2385 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2386 tci[0] = subtc[0]>>12;
2387 tci[1] = subtc[1]>>12;
2388 tci1[0] = tci[0] + 1;
2389 tci1[1] = tci[1] + 1;
2390 tci[0] &= tciwrapmask[0];
2391 tci[1] &= tciwrapmask[1];
2392 tci1[0] &= tciwrapmask[0];
2393 tci1[1] &= tciwrapmask[1];
2394 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2395 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2396 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2397 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2398 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2399 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2400 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2401 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2402 out4f[x*4+0] = c[0];
2403 out4f[x*4+1] = c[1];
2404 out4f[x*4+2] = c[2];
2405 out4f[x*4+3] = c[3];
// single-texel sampling with coordinates clamped to [tcimin, tcimax]
2409 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2411 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2413 tci[0] = subtc[0]>>12;
2414 tci[1] = subtc[1]>>12;
2415 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2416 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2417 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2418 c[0] = pixel[0][2] * (1.0f / 255.0f);
2419 c[1] = pixel[0][1] * (1.0f / 255.0f);
2420 c[2] = pixel[0][0] * (1.0f / 255.0f);
2421 c[3] = pixel[0][3] * (1.0f / 255.0f);
2422 out4f[x*4+0] = c[0];
2423 out4f[x*4+1] = c[1];
2424 out4f[x*4+2] = c[2];
2425 out4f[x*4+3] = c[3];
// single-texel sampling with coordinates wrapped through tciwrapmask
2430 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2432 tci[0] = subtc[0]>>12;
2433 tci[1] = subtc[1]>>12;
2434 tci[0] &= tciwrapmask[0];
2435 tci[1] &= tciwrapmask[1];
2436 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2437 c[0] = pixel[0][2] * (1.0f / 255.0f);
2438 c[1] = pixel[0][1] * (1.0f / 255.0f);
2439 c[2] = pixel[0][0] * (1.0f / 255.0f);
2440 c[3] = pixel[0][3] * (1.0f / 255.0f);
2441 out4f[x*4+0] = c[0];
2442 out4f[x*4+1] = c[1];
2443 out4f[x*4+2] = c[2];
2444 out4f[x*4+3] = c[3];
// SSE2 version of the 2D texture span sampler: writes one 4-byte pixel per
// span position to out4ub. Texture coordinates come from the interpolated
// attribute arrayindex multiplied by zf[x]; they are evaluated exactly at
// subspan endpoints and stepped in 16.16 fixed point in between, two pixels
// per loop iteration where possible. Separate loops handle linear filtering
// (in-range fast path, clamp-to-edge, wrap) and single-texel sampling
// (clamp-to-edge, wrap).
2450 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2454 int startx = span->startx;
2455 int endx = span->endx;
2457 __m128 data, slope, tcscale;
2458 __m128i tcsize, tcmask, tcoffset, tcmax;
2460 __m128i subtc, substep, endsubtc;
2463 int affine; // LordHavoc: optimized affine texturing case
2464 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2465 const unsigned char * RESTRICT pixelbase;
2466 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2467 // if no texture is bound, just fill it with white
2470 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2473 mip = triangle->mip[texunitindex];
// texture->mipmap[mip][0] is used as the byte offset of this mip level inside texture->bytes
2474 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2475 // if this mipmap of the texture is 1 pixel, just fill it with that color
2476 if (texture->mipmap[mip][1] == 4)
2478 unsigned int k = *((const unsigned int *)pixelbase);
2479 for (x = startx;x < endx;x++)
// equal zf at both ends of the span selects the affine case, which is processed as one subspan below
2483 affine = zf[startx] == zf[endx-1];
2484 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2485 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2486 flags = texture->flags;
// lanes become (mipmap[mip][2], mipmap[mip][3]) repeated twice, i.e. the x and y sizes for two pixels
2487 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
// size-1 per lane: wrap mask and largest valid coordinate
2488 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2489 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two texcoord components into both halves and pre-scale them to texel units
2490 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2491 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// exact texel-space coordinate at the first pixel of the span
2492 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// half-texel shift; the condition guarding this line is not visible in this excerpt
2494 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// convert to 16.16 fixed point
2495 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane holds 4 in its low 16 bits and 4*xsize in its high 16 bits, so
// _mm_madd_epi16 of an (x, y) pair of 16-bit coordinates yields the byte offset 4*x + 4*xsize*y
2496 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 packed to 16-bit lanes for the 16-bit min/and operations below
2497 tcmax = _mm_packs_epi32(tcmask, tcmask);
2498 for (x = startx;x < endx;)
2500 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2501 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2502 if (nextsub >= endx || affine)
// final (or affine) subspan: end exactly on the last pixel and rescale the step to its actual length
2504 nextsub = endsub = endx-1;
2505 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact texel-space coordinate at the far end of the subspan
2509 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2511 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2512 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2513 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low half: coordinate of pixel x, high half: coordinate of pixel x+1
2514 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
// the loops advance two pixels at a time, so double the step
2515 substep = _mm_slli_epi32(substep, 1);
// integer coordinates at the start of the subspan and one doubled step past its end
2518 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
// fast path when every lane satisfies 0 <= coordinate < size-1: no clamping or wrapping is applied per texel
2519 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// high 16 bits of tcoffset: byte distance between texture rows
2521 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2522 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2524 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep only the integer (upper 16-bit) parts of the 16.16 coordinates
2525 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2526 tci = _mm_madd_epi16(tci, tcoffset);
2527 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2528 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; the +stride load fetches the pair one row below
2529 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2530 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2531 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2532 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// halve the 16-bit fractions so they stay non-negative in the signed high multiply; the <<1 on the difference compensates
2533 fracm = _mm_srli_epi16(subtc, 1);
2534 pix1 = _mm_add_epi16(pix1,
2535 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2536 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2537 pix3 = _mm_add_epi16(pix3,
2538 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2539 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2540 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2541 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2542 pix2 = _mm_add_epi16(pix2,
2543 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2544 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2545 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the loop body above (its enclosing condition is not visible in this excerpt)
2549 const unsigned char * RESTRICT ptr1;
2550 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2551 tci = _mm_madd_epi16(tci, tcoffset);
2552 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2553 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2554 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2555 fracm = _mm_srli_epi16(subtc, 1);
2556 pix1 = _mm_add_epi16(pix1,
2557 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2558 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2559 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2560 pix1 = _mm_add_epi16(pix1,
2561 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2563 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// linear filtering with clamp-to-edge: the four neighbour coordinates are formed by adding
// (0,0), (1,0), (0,1), (1,1) and each is limited to [0, tcmax] before the texel is fetched
2567 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2569 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2571 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2572 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2573 tci = _mm_madd_epi16(tci, tcoffset);
2574 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2575 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2576 _mm_setzero_si128());
2577 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2578 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2579 _mm_setzero_si128());
2580 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is masked with tcmax (wrap) here,
// while the first pixel above and the single-pixel code below use min/max
// (clamp) in this clamp-to-edge branch - looks inconsistent, confirm.
2581 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2582 tci = _mm_madd_epi16(tci, tcoffset);
2583 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2584 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2585 _mm_setzero_si128());
2586 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2587 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2588 _mm_setzero_si128());
2589 fracm = _mm_srli_epi16(subtc, 1);
2590 pix1 = _mm_add_epi16(pix1,
2591 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2592 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2593 pix3 = _mm_add_epi16(pix3,
2594 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2595 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2596 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2597 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2598 pix2 = _mm_add_epi16(pix2,
2599 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2600 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2601 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the clamp-to-edge linear filter
2605 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2606 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2607 tci = _mm_madd_epi16(tci, tcoffset);
2608 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2609 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2610 _mm_setzero_si128());
2611 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2612 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2613 _mm_setzero_si128());
2614 fracm = _mm_srli_epi16(subtc, 1);
2615 pix1 = _mm_add_epi16(pix1,
2616 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2617 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2618 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2619 pix1 = _mm_add_epi16(pix1,
2620 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2621 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2622 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// linear filtering with wrapping: neighbour coordinates are ANDed with tcmax (size-1)
2628 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2630 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2631 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2632 tci = _mm_madd_epi16(tci, tcoffset);
2633 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2634 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2635 _mm_setzero_si128());
2636 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2637 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2638 _mm_setzero_si128());
2639 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2640 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2641 tci = _mm_madd_epi16(tci, tcoffset);
2642 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2643 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2644 _mm_setzero_si128());
2645 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2646 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2647 _mm_setzero_si128());
2648 fracm = _mm_srli_epi16(subtc, 1);
2649 pix1 = _mm_add_epi16(pix1,
2650 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2651 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2652 pix3 = _mm_add_epi16(pix3,
2653 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2654 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2655 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2656 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2657 pix2 = _mm_add_epi16(pix2,
2658 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2659 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2660 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the wrapping linear filter
2664 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2665 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2666 tci = _mm_madd_epi16(tci, tcoffset);
2667 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2668 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2669 _mm_setzero_si128());
2670 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2671 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2672 _mm_setzero_si128());
2673 fracm = _mm_srli_epi16(subtc, 1);
2674 pix1 = _mm_add_epi16(pix1,
2675 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2676 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2677 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2678 pix1 = _mm_add_epi16(pix1,
2679 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2680 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2681 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling, clamp-to-edge: one texel is copied per output pixel, two pixels per iteration
2688 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2690 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2692 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2693 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2694 tci = _mm_madd_epi16(tci, tcoffset);
2695 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2696 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel form of the clamp-to-edge texel copy
2700 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2701 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2702 tci = _mm_madd_epi16(tci, tcoffset);
2703 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling with wrapping
2709 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2711 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2712 tci = _mm_and_si128(tci, tcmax);
2713 tci = _mm_madd_epi16(tci, tcoffset);
2714 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2715 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
// single-pixel form of the wrapping texel copy
2719 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2720 tci = _mm_and_si128(tci, tcmax);
2721 tci = _mm_madd_epi16(tci, tcoffset);
2722 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2731 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2734 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Returns a float for the given vector.
// NOTE(review): the body of this function is not visible in this excerpt;
// judging by the name it is meant to sample a shadowmap - confirm what it
// actually returns before relying on it.
2737 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each 4-float pixel of in4f by the interpolated attribute
// arrayindex and stores the product in out4f, for x in [startx, endx).
// The attribute is evaluated per pixel as (data + slope*x) * z.
// NOTE(review): the declarations and the assignment of z are not visible in
// this excerpt; presumably z is taken from zf[x] - confirm.
2743 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2746 int startx = span->startx;
2747 int endx = span->endx;
// fills data and slope for this attribute from the triangle and span
2752 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2753 for (x = startx;x < endx;x++)
// attribute value at x, scaled by z
2756 c[0] = (data[0] + slope[0]*x) * z;
2757 c[1] = (data[1] + slope[1]*x) * z;
2758 c[2] = (data[2] + slope[2]*x) * z;
2759 c[3] = (data[3] + slope[3]*x) * z;
2760 out4f[x*4+0] = in4f[x*4+0] * c[0];
2761 out4f[x*4+1] = in4f[x*4+1] * c[1];
2762 out4f[x*4+2] = in4f[x*4+2] * c[2];
2763 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes the interpolated attribute arrayindex to out4f as four floats per
// pixel, for x in [startx, endx). The value is (data + slope*x) * z.
// NOTE(review): the declarations and the assignment of z are not visible in
// this excerpt; presumably z is taken from zf[x] - confirm.
2767 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2770 int startx = span->startx;
2771 int endx = span->endx;
// fills data and slope for this attribute from the triangle and span
2776 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2777 for (x = startx;x < endx;x++)
// attribute value at x, scaled by z
2780 c[0] = (data[0] + slope[0]*x) * z;
2781 c[1] = (data[1] + slope[1]*x) * z;
2782 c[2] = (data[2] + slope[2]*x) * z;
2783 c[3] = (data[3] + slope[3]*x) * z;
2784 out4f[x*4+0] = c[0];
2785 out4f[x*4+1] = c[1];
2786 out4f[x*4+2] = c[2];
2787 out4f[x*4+3] = c[3];
2791 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2793 int x, startx = span->startx, endx = span->endx;
2794 float c[4], localcolor[4];
2795 localcolor[0] = subcolor[0];
2796 localcolor[1] = subcolor[1];
2797 localcolor[2] = subcolor[2];
2798 localcolor[3] = subcolor[3];
2799 for (x = startx;x < endx;x++)
2801 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2802 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2803 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2804 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2805 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2806 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2807 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2808 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2812 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2814 int x, startx = span->startx, endx = span->endx;
2815 for (x = startx;x < endx;x++)
2817 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2818 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2819 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2820 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2824 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2826 int x, startx = span->startx, endx = span->endx;
2827 for (x = startx;x < endx;x++)
2829 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2830 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2831 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2832 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float span buffers per pixel: out = ina * a + inb * b for all four
// components, where a is one minus the fourth component of inb.
// NOTE(review): the declaration and assignment of b are not visible in this
// excerpt; presumably b is the fourth component of inb (so that a + b == 1)
// - confirm. triangle is not referenced in the visible lines.
2836 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2838 int x, startx = span->startx, endx = span->endx;
2840 for (x = startx;x < endx;x++)
// weight of ina: what remains after the fourth component of inb
2842 a = 1.0f - inb4f[x*4+3];
2844 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2845 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2846 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2847 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2851 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2853 int x, startx = span->startx, endx = span->endx;
2854 float localcolor[4], ilerp, lerp;
2855 localcolor[0] = color[0];
2856 localcolor[1] = color[1];
2857 localcolor[2] = color[2];
2858 localcolor[3] = color[3];
2859 ilerp = 1.0f - localcolor[3];
2860 lerp = localcolor[3];
2861 for (x = startx;x < endx;x++)
2863 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2864 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2865 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2866 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// Multiplies the 8-bit pixels of in4ub by the interpolated vertex attribute `arrayindex` and stores them in out4ub.
// The attribute is evaluated as (data + slope*x) * zf[x] only at subspan end points and stepped linearly in
// fixed point (scale 256) in between; a subspan covers at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
// NOTE(review): several short lines of this function are not visible in this excerpt (braces, the declarations of
// x/data/slope/mod/endmod, the carry-over of endmod/endsubmod into mod/submod, and the guard of the single-pixel tail).
2872 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2876 int startx = span->startx;
2877 int endx = span->endx;
2880 __m128i submod, substep, endsubmod;
2881 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (1 and 3 stay in place) - presumably RGBA to BGRA to match the byte buffers
2882 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2883 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point scaled by 256
2884 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2885 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2886 for (x = startx; x < endx;)
// default subspan length is DPSOFTRAST_DRAW_MAXSUBSPAN; the last subspan is clamped to end at endx-1
2888 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2889 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2890 if (nextsub >= endx)
2892 nextsub = endsub = endx-1;
2893 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// attribute value at the end of this subspan, and the fixed point step per pixel towards it
2897 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2898 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2899 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack two pixels into one register of 16-bit lanes: low half = value at x, high half = value at x+1
2900 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2901 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by one per-pixel substep for every two pixels processed - confirm whether the step should be doubled
2902 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// bytes are placed in the high byte of each 16-bit lane, so mulhi yields byte * submod / 256; packus clamps to 0..255
2904 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2905 pix = _mm_mulhi_epu16(pix, submod);
2906 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same multiply (the condition guarding it is not visible in this excerpt)
2910 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2911 pix = _mm_mulhi_epu16(pix, submod);
2912 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Writes the interpolated vertex attribute `arrayindex` itself as 8-bit pixels into out4ub.
// The attribute is evaluated as (data + slope*x) * zf[x] only at subspan end points and stepped linearly in
// fixed point (scale 4095) in between; a subspan covers at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels.
// NOTE(review): several short lines of this function are not visible in this excerpt (braces, the declarations of
// x/data/slope/mod/endmod, the carry-over of endmod/endsubmod into mod/submod, and the guard of the single-pixel tail).
2919 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2923 int startx = span->startx;
2924 int endx = span->endx;
2927 __m128i submod, substep, endsubmod;
2928 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 (1 and 3 stay in place) - presumably RGBA to BGRA to match the byte buffer
2929 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2930 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute value at the first pixel, as float and as fixed point scaled by 4095
2931 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2932 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2933 for (x = startx; x < endx;)
// default subspan length is DPSOFTRAST_DRAW_MAXSUBSPAN; the last subspan is clamped to end at endx-1
2935 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2936 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2937 if (nextsub >= endx)
2939 nextsub = endsub = endx-1;
2940 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// attribute value at the end of this subspan, and the fixed point step per pixel towards it
2944 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2945 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2946 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack two pixels into one register of 16-bit lanes: low half = value at x, high half = value at x+1
2947 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2948 substep = _mm_packs_epi32(substep, substep);
// two pixels per iteration
// NOTE(review): submod advances by one per-pixel substep for every two pixels processed - confirm whether the step should be doubled
2949 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// shifting right by 4 reduces the 4095-scaled value to the 0..255 range; packus clamps to bytes
2951 __m128i pix = _mm_srai_epi16(submod, 4);
2952 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the same conversion (the condition guarding it is not visible in this excerpt)
2956 __m128i pix = _mm_srai_epi16(submod, 4);
2957 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// For each pixel of the span: out4ub = ina4ub + max(inb4ub - subcolor*255, 0), clamped to 0..255 per byte.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
2964 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2967 int x, startx = span->startx, endx = span->endx;
// scale the four floats by 255, swap components 0 and 2 (presumably RGBA to BGRA), then duplicate into both halves as 16-bit lanes
2968 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2969 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
2970 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes, subtract with unsigned saturation (no wrap below zero), add, and clamp back to bytes
2972 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2973 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2974 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2975 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation on a single pixel
2979 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2980 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2981 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2982 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// For each pixel of the span: out4ub = ina4ub * inb4ub / 256 per byte.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
2987 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2990 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
2991 for (x = startx;x+2 <= endx;x+=2)
// pix1 holds the bytes as plain 16-bit values, pix2 holds them in the high byte (value * 256),
// so the high 16 bits of the product are a * b / 256
2993 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2994 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2995 pix1 = _mm_mulhi_epu16(pix1, pix2);
2996 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation on a single pixel
3000 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3001 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3002 pix1 = _mm_mulhi_epu16(pix1, pix2);
3003 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// For each pixel of the span: out4ub = ina4ub + inb4ub per byte, clamped to 255.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
3008 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3011 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3012 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes so the sum cannot wrap, then packus clamps it back to a byte
3014 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3015 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3016 pix1 = _mm_add_epi16(pix1, pix2);
3017 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation on a single pixel
3021 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3022 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3023 pix1 = _mm_add_epi16(pix1, pix2);
3024 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// For each pixel of the span: out4ub = ina4ub + inb4ub * inbtintbgra per byte, clamped to 0..255.
// Unlike the other uniform-color helpers nearby, the tint is not component-swapped here; the parameter name
// suggests it is already in BGRA order - confirm against callers.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
3029 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3032 int x, startx = span->startx, endx = span->endx;
// tint as fixed point scaled by 256, duplicated into both halves as 16-bit lanes
3033 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3034 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3035 for (x = startx;x+2 <= endx;x+=2)
// pix2 holds the bytes in the high byte of each lane, so mulhi(tint, pix2) is tint * byte scaled back to byte range
3037 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3038 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3039 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3040 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation on a single pixel
3044 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3045 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3046 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3047 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// For each pixel of the span: out4ub = ina4ub + (inb4ub - ina4ub) * alpha / 256 per byte,
// where alpha is the fourth byte of the inb4ub pixel.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
3052 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3055 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3056 for (x = startx;x+2 <= endx;x+=2)
3058 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3059 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth lane (alpha of the second buffer) to all four lanes of that pixel
3060 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so the signed high-16 product equals difference * alpha / 256
3061 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3062 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same operation on a single pixel (only the low four lanes carry data)
3066 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3067 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3068 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3069 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3070 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// For each pixel of the span: out4ub = in4ub + (color*255 - in4ub) * alpha / 256 per byte,
// where alpha is the fourth component of color scaled by 255.
// NOTE(review): braces and the condition guarding the single-pixel tail are not visible in this excerpt.
3075 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3078 int x, startx = span->startx, endx = span->endx;
// scale the four floats by 255, swap components 0 and 2 (presumably RGBA to BGRA), then duplicate into both halves as 16-bit lanes
3079 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3080 localcolor = _mm_packs_epi32(localcolor, localcolor);
// blend weight: the color's fourth lane broadcast to every lane and pre-shifted left by 4 for the mulhi below
3081 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3082 for (x = startx;x+2 <= endx;x+=2)
3084 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed high-16 product equals difference * alpha / 256
3085 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3086 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same operation on a single pixel
3090 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3091 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3092 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex shader for the generic mode: projects positions with the ModelViewProjection matrix uniform and
// loads the color and first texcoord arrays; the second texcoord array is loaded only for the SPECULAR permutation.
3099 void DPSOFTRAST_VertexShader_Generic(void)
3101 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3102 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3103 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3104 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3105 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3108 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3110 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3111 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3112 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3114 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3115 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3117 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3118 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3119 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3121 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3122 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3125 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3127 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3130 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3132 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3135 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3140 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3141 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the post-process mode: projects positions with the ModelViewProjection matrix uniform and
// loads two texcoord arrays.
// Note that the second load names two different arrays (TEXCOORD1 and TEXCOORD4); which argument is the
// destination and which the source is defined by DPSOFTRAST_Array_Load, which is not visible here.
3146 void DPSOFTRAST_VertexShader_PostProcess(void)
3148 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3149 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3150 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the post-process mode: samples the first texture into the fragment color, optionally adds
// bloom from the second texture (BLOOM permutation), mixes in the ViewTintColor uniform and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognised but currently do nothing.
3153 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3155 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3156 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3157 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3158 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3159 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3160 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3161 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// the bloom texture is sampled with vertex array 3 and added in place after subtracting the BloomColorSubtract uniform
3163 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3164 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3166 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3167 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3169 // TODO: implement saturation
3171 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3173 // TODO: implement gammaramps
3175 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the depth/shadow mode: only projects positions with the ModelViewProjection matrix uniform.
3180 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3182 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel shader for the depth/shadow mode: fills the span's fragment color with zero bytes and finishes the span.
3185 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3187 // this is never called (because colormask is off when this shader is used)
3188 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3189 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// four bytes per pixel, cleared only over [startx, endx)
3191 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3192 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the flat-color mode: projects positions with the ModelViewProjection matrix uniform and
// transforms the first texcoord array by the TexMatrix uniform.
3197 void DPSOFTRAST_VertexShader_FlatColor(void)
3199 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3200 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the flat-color mode: samples the color texture and multiplies it by the Color_Ambient uniform,
// with the Alpha uniform substituted for the ambient color's fourth component.
// NOTE(review): several short lines of this function are not visible in this excerpt (braces, the declarations of
// color/pix/pix2, and the loop control between the four-pixel and single-pixel paths).
3203 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into the first color buffer at the span's row/column offset
3207 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3208 int x, startx = span->startx, endx = span->endx;
3209 __m128i Color_Ambientm;
3210 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3211 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3212 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3213 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3214 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// with alpha test or any non-opaque blend mode, render into the temporary buffer and let FinishBGRA8 write it out
3215 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3216 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; the fourth lane is cleared and replaced by Alpha*255;
// finally packed to 16-bit lanes and duplicated so one register covers two pixels
3217 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3218 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3219 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3220 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3221 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and their four mask bytes are all 1, so process four pixels at once
3224 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3227 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texture bytes sit in the high byte of each 16-bit lane, so mulhi gives ambient * byte scaled back to byte range
3228 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3229 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3230 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3236 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3237 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3238 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3240 if (pixel == buffer_FragColorbgra8)
3241 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the vertex-color mode: projects positions with the ModelViewProjection matrix uniform,
// loads the color array and transforms the first texcoord array by the TexMatrix uniform.
3247 void DPSOFTRAST_VertexShader_VertexColor(void)
3249 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3250 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3251 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel shader for the vertex-color mode: per pixel computes
// (Color_Diffuse * interpolated vertex color + Color_Ambient) * color texture,
// with the Alpha uniform substituted for the ambient color's fourth component.
// NOTE(review): several short lines of this function are not visible in this excerpt (braces, the declarations of
// data/slope/mod2/pix2, and the loop control between the four-pixel and single-pixel paths).
3254 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3257 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into the first color buffer at the span's row/column offset
3258 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3259 int x, startx = span->startx, endx = span->endx;
3260 __m128i Color_Ambientm, Color_Diffusem;
3262 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3266 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3267 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// with alpha test or any non-opaque blend mode, render into the temporary buffer and let FinishBGRA8 write it out
3268 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3269 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; fourth lane replaced by Alpha*255; packed to 16-bit lanes
3270 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3271 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3272 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3273 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with components 0 and 2 swapped; fourth lane cleared; packed to 16-bit lanes
3274 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3275 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3276 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: component-swapped like the uniforms, advanced to startx, and scaled by 4096
3277 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3278 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3279 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3280 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3281 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3282 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is advanced by one slope per loop iteration here, plus three more times inside the four-pixel path
3283 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3285 __m128i color, mod, pix;
// fast path: at least four pixels remain and their four mask bytes are all 1, so process four pixels at once
3286 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3289 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3290 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// each of the four pixels multiplies the attribute by its own buffer_z entry; mod holds pixels 0-1, mod2 pixels 2-3
3291 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3292 data = _mm_add_ps(data, slope);
3293 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3294 data = _mm_add_ps(data, slope);
3295 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3296 data = _mm_add_ps(data, slope);
3297 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) times the texture bytes held in the high byte of each 16-bit lane
3298 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3299 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3300 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3301 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3302 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3308 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3309 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3310 mod = _mm_packs_epi32(mod, mod);
3311 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3312 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3314 if (pixel == buffer_FragColorbgra8)
3315 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the lightmap mode: projects positions with the ModelViewProjection matrix uniform,
// transforms the first texcoord array by the TexMatrix uniform and loads the TEXCOORD4 array
// (used as the lightmap texcoord by the matching pixel shader).
3321 void DPSOFTRAST_VertexShader_Lightmap(void)
3323 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3324 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3325 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the lightmap mode: per pixel computes
// (Color_Diffuse * lightmap texture + Color_Ambient) * color texture, plus Color_Glow * glow texture when the
// GLOW permutation is set. The Alpha uniform is substituted for the ambient color's fourth component.
// NOTE(review): several short lines of this function are not visible in this excerpt (braces, the declaration of
// pix2, the loop control between the four-pixel and single-pixel paths, and presumably the else that separates
// the glow loop from the non-glow loop).
3328 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3331 unsigned char * RESTRICT pixelmask = span->pixelmask;
// by default results are written straight into the first color buffer at the span's row/column offset
3332 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3333 int x, startx = span->startx, endx = span->endx;
3334 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3335 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3336 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3340 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3341 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3342 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// with alpha test or any non-opaque blend mode, render into the temporary buffer and let FinishBGRA8 write it out
3343 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3344 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; fourth lane replaced by Alpha*255; packed to 16-bit lanes
3345 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3346 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3347 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3348 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256 with components 0 and 2 swapped; fourth lane cleared; packed to 16-bit lanes
3349 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3350 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3351 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// glow variant: an additional glow texture is sampled and added after being scaled by Color_Glow
3352 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3354 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3355 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3356 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3357 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color; used by the single-pixel path to do both multiplies in one instruction
3358 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3359 for (x = startx;x < endx;x++)
3361 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and their four mask bytes are all 1, so process four pixels at once
3362 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3365 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3366 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3367 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// pix covers pixels 0-1 and pix2 pixels 2-3: (diffuse * lightmap + ambient) * color + glowcolor * glow
3368 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3369 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3370 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3371 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3372 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3373 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3374 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low half and glowcolor * glow in the high half,
// then the high half is added onto the low half
3380 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3381 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3382 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3383 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3384 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3385 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without glow: (diffuse * lightmap + ambient) * color
3390 for (x = startx;x < endx;x++)
3392 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and their four mask bytes are all 1, so process four pixels at once
3393 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3396 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3397 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3398 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3399 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3400 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3401 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3402 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3408 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3409 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3410 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3411 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary-buffer case still needs the finishing pass
3414 if (pixel == buffer_FragColorbgra8)
3415 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders that follow call these
// before their definitions appear further down in the file.
3420 void DPSOFTRAST_VertexShader_LightDirection(void);
3421 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex shader for the fake-light mode: delegates entirely to the LightDirection vertex shader.
3423 void DPSOFTRAST_VertexShader_FakeLight(void)
3425 DPSOFTRAST_VertexShader_LightDirection();
// Pixel shader for the fake-light mode: delegates entirely to the LightDirection pixel shader.
3428 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3430 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the model-space light-direction-map mode: runs the LightDirection vertex shader and
// additionally loads the TEXCOORD4 array.
3435 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3437 DPSOFTRAST_VertexShader_LightDirection();
3438 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the model-space light-direction-map mode: delegates entirely to the LightDirection pixel shader.
3441 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3443 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for the tangent-space light-direction-map mode: runs the LightDirection vertex shader and
// additionally loads the TEXCOORD4 array.
3448 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3450 DPSOFTRAST_VertexShader_LightDirection();
3451 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel shader for the tangent-space light-direction-map mode: delegates entirely to the LightDirection pixel shader.
3454 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Shared vertex stage for the light-direction family of shaders.
// For every vertex it:
//   - dots the LightDir uniform with the TEXCOORD1/2/3 vectors (called svector,
//     tvector and normal here) and stores the result in TEXCOORD5;
//   - dots (EyePosition - position) with the same three vectors and stores the
//     result in TEXCOORD6;
// both with the fourth component written as 0. It finishes by calling
// DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjectionMatrix uniform.
3461 void DPSOFTRAST_VertexShader_LightDirection(void)
3464 int numvertices = dpsoftrast.numvertices;
3466 float LightVector[4];
3467 float EyePosition[4];
3468 float EyeVectorModelSpace[4];
// Copy the light direction and eye position uniforms into locals.
3474 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3475 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3476 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3477 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3478 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3479 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3480 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3481 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays: position and TEXCOORD1..3 via Array_Load,
// TEXCOORD0 via Array_Transform with the TexMatrix uniform.
// NOTE(review): presumably these calls populate dpsoftrast.post_array4f, which
// the loop below reads - confirm against DPSOFTRAST_Array_Load.
3482 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3483 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3484 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3485 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3486 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3487 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its three basis vectors (4 floats per vertex,
// only the first three are used).
3489 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3490 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3491 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3492 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3493 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3494 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3495 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3496 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3497 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3498 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3499 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3500 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction expressed in the (svector, tvector, normal) basis -> TEXCOORD5.
3501 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3502 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3503 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3504 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3505 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3506 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3507 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, expressed in the same basis -> TEXCOORD6.
3508 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3509 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3510 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3511 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3512 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3513 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3514 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3515 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3516 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3517 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Transform/project the positions with the model-view-projection matrix uniform.
// NOTE(review): the meaning of the -1 second argument is not visible here - confirm
// against DPSOFTRAST_Array_TransformProject.
3519 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helpers used by the pixel shaders below.
// Min/Max are plain ternary macros: each argument is expanded (and therefore
// evaluated) more than once, so do not pass expressions with side effects.
// Vector3Dot is the 3-component dot product of two indexable operands;
// LengthSquared/Length build on it (Length goes through sqrt).
3522 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3523 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3524 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3525 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3526 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that begins by computing the length of v as
// sqrt(dot(v, v)) into a local named len.
// NOTE(review): presumably it then rescales v in place to unit length - confirm
// against the remaining continuation lines of the macro.
3527 #define DPSOFTRAST_Vector3Normalize(v)\
3530 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared span shader behind the FakeLight and LightDirectionMap pixel-shader wrappers.
//
// Samples the colour texture (plus pants/shirt and glow textures when the
// corresponding permutation bits are set) and then shades every pixel x in
// [span->startx, span->endx) along one of three paths chosen from
// thread->shader_permutation:
//   - SHADERPERMUTATION_SPECULAR: ambient + (diffuse + specular) * light colour;
//   - SHADERPERMUTATION_DIFFUSE:  ambient + diffuse * light colour;
//   - final loop (presumably the fallback when neither bit is set): ambient only.
// Inside the two lit paths thread->shader_mode selects where the light normal and
// light colour come from (deluxemap/lightmap textures, the eye vector, or the
// interpolated TEXCOORD5 light vector). Results are written as BGRA bytes into
// buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread state; supplies uniform4f, shader_permutation, shader_mode
//            and shader_exactspecularmath.
// triangle - triangle being rasterized (passed through to the span helpers).
// span     - span being shaded; startx/endx bound the pixel loops.
3541 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Per-span scratch buffers: one float per pixel for buffer_z, four bytes (BGRA)
// per pixel for every sampled texture and for the output colour.
3543 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3544 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3545 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3546 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3551 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3552 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3553 int x, startx = span->startx, endx = span->endx;
3554 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is kept as a (data, slope) pair and evaluated per
// pixel as data + slope * x.
3555 float LightVectordata[4];
3556 float LightVectorslope[4];
3557 float EyeVectordata[4];
3558 float EyeVectorslope[4];
3559 float VectorSdata[4];
3560 float VectorSslope[4];
3561 float VectorTdata[4];
3562 float VectorTslope[4];
3563 float VectorRdata[4];
3564 float VectorRslope[4];
3566 float diffusetex[4];
3568 float surfacenormal[4];
3569 float lightnormal[4];
3570 float lightnormal_modelspace[4];
3572 float specularnormal[4];
3575 float SpecularPower;
// Colour uniforms are copied with components 0 and 2 swapped so that index 0..2
// lines up with the B,G,R byte order of the bgra8 buffers they are multiplied with.
3577 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3578 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3579 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3580 Color_Glow[3] = 0.0f;
3581 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3582 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3583 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient colour's fourth slot carries the Alpha uniform; it is the only
// factor applied to the texture alpha when d[3] is computed below.
3584 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3585 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3586 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3587 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3588 Color_Pants[3] = 0.0f;
3589 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3590 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3591 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3592 Color_Shirt[3] = 0.0f;
// Start the span (receives buffer_z) and sample the base colour texture with TEXCOORD0.
3593 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3594 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Optional extra layers, sampled only when their permutation bit is set.
3595 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3597 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3598 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3600 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3602 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: diffuse + specular lighting ----
3604 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3606 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3607 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3608 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3609 Color_Diffuse[3] = 0.0f;
3610 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3611 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3612 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3613 LightColor[3] = 0.0f;
3614 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3615 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3616 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3617 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3618 Color_Specular[3] = 0.0f;
// Pre-scaled by 1/255 because it is later multiplied by the gloss texture's
// alpha, which is read as a raw byte value.
3619 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3620 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3621 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Per-mode inputs: model-space maps also need the interpolated S/T/R vectors;
// both map modes sample lightmap and deluxemap with TEXCOORD4.
3623 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3625 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3626 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3627 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3628 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3629 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3631 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3633 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3634 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3636 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3638 // nothing of this needed
// Remaining mode: the light direction comes from the interpolated TEXCOORD5 vector.
3642 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3645 for (x = startx;x < endx;x++)
// Base colour texel, kept in 0..255 byte scale as floats.
3648 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3649 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3650 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3651 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3652 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// Add the tinted pants and shirt layers; their [3] tints are 0, so alpha is unchanged.
3654 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3655 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3656 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3657 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3659 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3660 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3661 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3662 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal-map texel: byte/128 - 1 per channel, reading the bytes in
// reverse (index 2,1,0) so that surfacenormal[0..2] is taken from R,G,B.
3663 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3664 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3665 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3666 DPSOFTRAST_Vector3Normalize(surfacenormal);
3668 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3670 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3671 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3672 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3673 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
// Re-express the decoded deluxemap direction in the interpolated S/T/R basis.
3675 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3676 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3677 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3678 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3680 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3681 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3682 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3683 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3685 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3686 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3687 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3688 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3690 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3691 DPSOFTRAST_Vector3Normalize(lightnormal);
3693 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and additionally divided by
// max(0.25, lightnormal.z), which caps the boost at 4x.
3695 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3696 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3697 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3698 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3701 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// The decoded deluxemap texel is used directly as the light normal (it is not
// normalized in this branch); the lightmap is scaled by a plain 1/256.
3703 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3707 float f = 1.0f / 256.0f;
3708 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3709 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3710 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3713 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Fake light: light comes from the viewer - the normalized interpolated eye
// vector is the light normal and the light colour is white.
// NOTE(review): z is presumably the per-pixel value taken from buffer_z - confirm.
3715 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3716 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3717 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3718 DPSOFTRAST_Vector3Normalize(lightnormal);
3720 LightColor[0] = 1.0;
3721 LightColor[1] = 1.0;
3722 LightColor[2] = 1.0;
// Remaining mode: normalized interpolated light vector; LightColor keeps the
// uniform value loaded before the loop.
3726 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3727 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3728 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3729 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
3732 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3734 if(thread->shader_exactspecularmath)
3736 // reflect lightnormal at surfacenormal, take the negative of that
3737 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3739 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3740 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3741 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3742 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3744 // dot of this and normalize(EyeVectorFogDepth.xyz)
3745 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3746 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3747 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3748 DPSOFTRAST_Vector3Normalize(eyenormal);
3750 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact variant: half-vector form, normalize(light + eye) dotted with the
// surface normal, clamped at zero.
3754 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3755 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3756 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3757 DPSOFTRAST_Vector3Normalize(eyenormal);
3759 specularnormal[0] = lightnormal[0] + eyenormal[0];
3760 specularnormal[1] = lightnormal[1] + eyenormal[1];
3761 specularnormal[2] = lightnormal[2] + eyenormal[2];
3762 DPSOFTRAST_Vector3Normalize(specularnormal);
3764 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower (already /255) times the gloss texture's alpha byte.
3767 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel. Only the upper bound (255) is clamped here.
3768 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3770 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3771 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3772 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3773 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3777 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3778 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3779 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3780 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3783 buffer_FragColorbgra8[x*4+0] = d[0];
3784 buffer_FragColorbgra8[x*4+1] = d[1];
3785 buffer_FragColorbgra8[x*4+2] = d[2];
3786 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse-only lighting ----
// Same light-normal / light-colour selection as path 1, without the gloss
// texture, eye-vector specular term or colormapping layers.
3789 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3791 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3792 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3793 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3794 Color_Diffuse[3] = 0.0f;
3795 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3796 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3797 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3798 LightColor[3] = 0.0f;
3799 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3801 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3803 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3804 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3805 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3806 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3807 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3809 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3811 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3812 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3814 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// Unlike path 1, the eye vector is only needed here for the fake-light mode.
3816 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3820 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3823 for (x = startx;x < endx;x++)
3826 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3827 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3828 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3829 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Decode and normalize the normal-map texel (byte/128 - 1, bytes read in reverse order).
3830 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3831 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3832 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3833 DPSOFTRAST_Vector3Normalize(surfacenormal);
3835 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3837 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3838 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3839 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3840 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3842 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3843 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3844 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3845 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3847 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3848 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3849 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3850 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3852 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3853 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3854 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3855 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3857 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3858 DPSOFTRAST_Vector3Normalize(lightnormal);
3860 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3862 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3863 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3864 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3865 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3868 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3870 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3874 float f = 1.0f / 256.0f;
3875 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3876 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3877 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3880 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3882 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3883 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3884 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3885 DPSOFTRAST_Vector3Normalize(lightnormal);
3887 LightColor[0] = 1.0;
3888 LightColor[1] = 1.0;
3889 LightColor[2] = 1.0;
3893 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3894 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3895 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3896 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term (N.L clamped at zero), then texture * (ambient + diffuse * light),
// plus glow when enabled. Only the upper bound (255) is clamped.
3899 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3900 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3902 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3903 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3904 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3905 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3909 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3910 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3911 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3912 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3914 buffer_FragColorbgra8[x*4+0] = d[0];
3915 buffer_FragColorbgra8[x*4+1] = d[1];
3916 buffer_FragColorbgra8[x*4+2] = d[2];
3917 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (no normal map, no light term) ----
3922 for (x = startx;x < endx;x++)
3925 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3926 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3927 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3928 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3930 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3932 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3933 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3934 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3935 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3939 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3940 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3941 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3942 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3944 buffer_FragColorbgra8[x*4+0] = d[0];
3945 buffer_FragColorbgra8[x*4+1] = d[1];
3946 buffer_FragColorbgra8[x*4+2] = d[2];
3947 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the shaded BGRA bytes for this span to the span finisher.
3950 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the light-source shader.
// For every vertex it computes (LightPosition - position) and
// (EyePosition - position), dots each with the TEXCOORD1/2/3 vectors (svector,
// tvector, normal) and writes the results back over TEXCOORD1 (light vector) and
// TEXCOORD2 (eye vector), with the fourth component set to 0. Afterwards it
// projects the positions with the ModelViewProjectionMatrix uniform and fills
// TEXCOORD3 by transforming the position array with the ModelToLight uniform.
3955 void DPSOFTRAST_VertexShader_LightSource(void)
3958 int numvertices = dpsoftrast.numvertices;
3959 float LightPosition[4];
3960 float LightVector[4];
3961 float LightVectorModelSpace[4];
3962 float EyePosition[4];
3963 float EyeVectorModelSpace[4];
// Copy the light position and eye position uniforms into locals.
3969 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3970 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3971 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3972 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3973 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3974 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3975 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3976 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the per-vertex arrays: position and TEXCOORD1..4 via Array_Load,
// TEXCOORD0 via Array_Transform with the TexMatrix uniform.
3977 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3978 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3979 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3980 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3981 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3982 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3983 for (i = 0;i < numvertices;i++)
// Read position and all three basis vectors into locals first: TEXCOORD1 and
// TEXCOORD2 are overwritten further down in this same iteration.
3985 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3986 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3987 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3988 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3989 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3990 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3991 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3992 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3993 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3994 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3995 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3996 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, expressed in the (svector, tvector,
// normal) basis -> stored in TEXCOORD1.
3997 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3998 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3999 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4000 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4001 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4002 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4003 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4004 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4005 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4006 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye, expressed in the same basis -> stored in TEXCOORD2.
4007 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4008 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4009 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4010 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4011 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4012 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4013 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4014 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4015 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4016 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// Project the positions, then fill TEXCOORD3 from the position array using the
// ModelToLight matrix uniform.
// NOTE(review): whether Array_Transform reads the pre- or post-projection
// positions is not visible here - confirm against DPSOFTRAST_Array_Transform.
4018 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4019 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Per-span pixel shader for the light-source shader mode.
// Builds buffer_FragColorbgra8 for x in [span->startx, span->endx) and hands it to
// DPSOFTRAST_Draw_Span_FinishBGRA8. Depending on thread->shader_permutation the
// per-pixel colour is ambient+diffuse+specular (SHADERPERMUTATION_SPECULAR),
// ambient+diffuse (SHADERPERMUTATION_DIFFUSE), or ambient only (the last loop).
// NOTE(review): this view omits some short lines (braces, declarations of z,
// attenuation, glosstex, eyenormal, diffuse, specular, f and d, and presumably
// else/continue statements) -- confirm control flow against the full source.
4022 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// Scratch buffers, one entry (or 4 bytes) per pixel of a maximum-length span.
4025 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4026 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4027 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4028 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4029 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4030 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4031 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4032 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4033 int x, startx = span->startx, endx = span->endx;
4034 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// Each interpolated attribute is held as a base value ("data") plus a per-x slope.
4035 float CubeVectordata[4];
4036 float CubeVectorslope[4];
4037 float LightVectordata[4];
4038 float LightVectorslope[4];
4039 float EyeVectordata[4];
4040 float EyeVectorslope[4];
4042 float diffusetex[4];
4044 float surfacenormal[4];
4045 float lightnormal[4];
4047 float specularnormal[4];
4050 float SpecularPower;
4051 float CubeVector[4];
// Uniform components 0/1/2 are copied into local slots 2/1/0 (reversed order),
// presumably RGB -> BGR so they line up with the bgra8 texture/output buffers.
// Color_Glow is loaded here but not read by any visible line of this function.
4054 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4055 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4056 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4057 Color_Glow[3] = 0.0f;
4058 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4059 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4060 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The ambient alpha slot is taken from the separate Alpha uniform.
4061 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4062 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4063 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4064 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4065 Color_Diffuse[3] = 0.0f;
4066 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4067 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4068 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4069 Color_Specular[3] = 0.0f;
4070 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4071 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4072 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4073 Color_Pants[3] = 0.0f;
4074 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4075 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4076 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4077 Color_Shirt[3] = 0.0f;
4078 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4079 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4080 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4081 LightColor[3] = 0.0f;
// The specular exponent is pre-scaled by 1/255 because it is later multiplied by
// the gloss texture alpha, which is a 0..255 byte value.
4082 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Light vector comes from TEXCOORD1, eye vector from TEXCOORD2, cube/light-space
// position from TEXCOORD3.
4083 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4084 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4085 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4086 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4087 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// Texture fetches shared by all three lighting branches.
4088 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4089 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4091 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4092 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4094 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4095 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- Branch 1: ambient + diffuse + specular ----
4096 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4098 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4099 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4100 for (x = startx;x < endx;x++)
// z is assigned on a line not visible here (presumably from buffer_z[x]).
// Attenuation falls off as 1 - |CubeVector|^2.
4103 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4104 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4105 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4106 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// Near-zero attenuation: presumably the pixel is skipped (statement not visible),
// leaving the zero written by the memset above.
4107 if (attenuation < 0.01f)
4109 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4111 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4112 if (attenuation < 0.01f)
4116 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4117 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4118 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4119 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Colormapping adds the pants and shirt layers, each tinted by its colour uniform.
4120 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4122 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4123 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4124 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4125 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4127 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4128 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4129 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4130 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: bytes 2/1/0 map to components 0/1/2, each byte/128 - 1.
4131 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4132 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4133 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4134 DPSOFTRAST_Vector3Normalize(surfacenormal);
4136 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4137 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4138 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4139 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N.L clamped at zero.
4141 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Exact specular: reflection vector dotted with the eye vector.
4143 if(thread->shader_exactspecularmath)
4145 // reflect lightnormal at surfacenormal, take the negative of that
4146 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4148 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4149 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4150 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4151 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4153 // dot of this and normalize(EyeVectorFogDepth.xyz)
4154 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4155 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4156 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4157 DPSOFTRAST_Vector3Normalize(eyenormal);
4159 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Otherwise (the else line is presumably hidden): half-vector specular, i.e.
// normalize(L + E) dotted with the surface normal.
4163 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4164 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4165 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4166 DPSOFTRAST_Vector3Normalize(eyenormal);
4168 specularnormal[0] = lightnormal[0] + eyenormal[0];
4169 specularnormal[1] = lightnormal[1] + eyenormal[1];
4170 specularnormal[2] = lightnormal[2] + eyenormal[2];
4171 DPSOFTRAST_Vector3Normalize(specularnormal);
4173 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent is SpecularPower (already /255) times the gloss alpha byte.
4175 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine terms per channel, clamp the upper bound to 255; alpha is the diffuse
// texture alpha. The second group of d[] assignments is the non-cubefilter case.
4177 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4179 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4180 attenuation *= (1.0f / 255.0f);
4181 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4182 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4183 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4184 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4188 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4189 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4190 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4191 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4193 buffer_FragColorbgra8[x*4+0] = d[0];
4194 buffer_FragColorbgra8[x*4+1] = d[1];
4195 buffer_FragColorbgra8[x*4+2] = d[2];
4196 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Branch 2: ambient + diffuse (no gloss texture, no specular term) ----
4199 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4201 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4202 for (x = startx;x < endx;x++)
4205 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4206 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4207 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4208 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4209 if (attenuation < 0.01f)
4211 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4213 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4214 if (attenuation < 0.01f)
4218 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4219 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4220 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4221 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4222 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4224 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4225 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4226 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4227 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4229 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4230 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4231 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4232 DPSOFTRAST_Vector3Normalize(surfacenormal);
4234 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4235 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4236 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4237 DPSOFTRAST_Vector3Normalize(lightnormal);
4239 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4240 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4242 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4243 attenuation *= (1.0f / 255.0f);
4244 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4245 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4246 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4247 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4251 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4252 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4253 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4254 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4256 buffer_FragColorbgra8[x*4+0] = d[0];
4257 buffer_FragColorbgra8[x*4+1] = d[1];
4258 buffer_FragColorbgra8[x*4+2] = d[2];
4259 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Branch 3 (presumably the final else): ambient only, no normal map ----
4264 for (x = startx;x < endx;x++)
4267 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4268 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4269 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4270 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4271 if (attenuation < 0.01f)
4273 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4275 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4276 if (attenuation < 0.01f)
4280 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4281 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4282 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4283 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4284 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4286 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4287 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4288 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4289 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4291 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4293 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4294 attenuation *= (1.0f / 255.0f);
4295 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4296 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4297 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4298 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4302 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4303 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4304 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4305 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4307 buffer_FragColorbgra8[x*4+0] = d[0];
4308 buffer_FragColorbgra8[x*4+1] = d[1];
4309 buffer_FragColorbgra8[x*4+2] = d[2];
4310 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished BGRA8 span to the common span finisher.
4313 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode.
// Order matters: TEXCOORD1 is derived from POSITION before POSITION itself is
// overwritten by the TransformProject call on the last line.
4319 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD1 <- POSITION transformed by the model-view-projection matrix uniform.
4321 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 <- TEXCOORD0 transformed in place by the texture matrix uniform.
4322 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4323 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Per-span pixel shader for the refraction shader mode.
// For each pixel it computes a screen-space texture coordinate from the
// interpolated clip-space position (TEXCOORD1), offsets it by the x/y of the
// normal-map sample scaled by DistortScaleRefractReflect, samples the texture
// bound at GL20TU_REFRACTION (bilinear or nearest), and multiplies by RefractColor.
// Returns without drawing if no refraction texture is bound.
// NOTE(review): this view omits some short lines (braces, else, declarations of
// iw, v and c) -- confirm against the full source.
4326 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4328 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4330 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4332 int x, startx = span->startx, endx = span->endx;
4335 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4336 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4339 float ModelViewProjectionPositiondata[4];
4340 float ModelViewProjectionPositionslope[4];
4343 float ScreenScaleRefractReflect[2];
4344 float ScreenCenterRefractReflect[2];
4345 float DistortScaleRefractReflect[2];
4346 float RefractColor[4];
4348 const unsigned char * RESTRICT pixelbase;
4349 const unsigned char * RESTRICT pixel[4];
4350 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4351 if(!texture) return;
// Start of mip level 0 inside the texture's byte storage.
4352 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4355 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4356 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4359 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4362 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4363 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4364 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4365 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4366 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4367 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor components 0..2 are loaded in reversed order (uniform 2,1,0),
// presumably RGB -> BGR to match the bgra8 output buffer; alpha is copied as-is.
4368 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4369 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4370 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4371 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4374 for (x = startx;x < endx;x++)
4376 float SafeScreenTexCoord[2];
4377 float ScreenTexCoord[2];
4384 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
// Reciprocal of the interpolated w component at this x.
4385 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4387 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4388 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4389 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4391 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// Decode the normal-map texel (bytes 2/1/0 -> components 0/1/2, byte/128 - 1).
4392 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4393 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4394 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4395 DPSOFTRAST_Vector3Normalize(v);
4396 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4397 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4399 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
// Bilinear path: coordinates are 20.12 fixed point (size<<12), shifted back by
// 2048 (half a texel). The low 12 bits are the blend fraction; the four weights
// in lerp[] sum to 2^24, hence the >>24 after accumulation.
4400 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// NOTE(review): the initializers convert a float to unsigned int; the value is
// negative whenever ScreenTexCoord*size*4096 < 2048, and converting a negative
// float to an unsigned type is undefined behaviour in C -- confirm intent.
4402 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4403 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4404 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4405 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4406 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4407 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// Clamp both texel indices and their +1 neighbours to [0, size-1];
// mipmap[0][2] is used as the row width and mipmap[0][3] as the row count.
4408 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4409 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4410 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4411 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// Four neighbouring texels, 4 bytes each.
4412 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4413 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4414 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4415 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4416 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4417 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4418 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Nearest path (presumably the else branch; the lines copying pixel[0] into c
// are not visible here): one clamped texel lookup.
4422 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4423 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4424 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4425 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4431 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// Tint the sampled colour; alpha is RefractColor[3] scaled by 256 and capped at 255.
4432 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4433 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4434 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4435 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4438 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode: only runs TransformProject on the
// POSITION array (in place) with the model-view-projection matrix uniform.
4443 void DPSOFTRAST_VertexShader_Water(void)
4445 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the water shader mode. No shading is computed: the span's
// colour bytes are zeroed and passed to the span finisher, so this appears to be
// a placeholder implementation.
4449 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4452 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4453 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4454 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero 4 bytes per pixel for x in [startx, endx).
4455 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4456 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode: only runs TransformProject on the
// POSITION array (in place) with the model-view-projection matrix uniform.
4461 void DPSOFTRAST_VertexShader_ShowDepth(void)
4463 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the show-depth shader mode. No depth visualisation is
// computed in the visible code: the span's colour bytes are zeroed and passed to
// the span finisher, so this appears to be a placeholder implementation.
4466 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4469 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4470 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4471 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero 4 bytes per pixel for x in [startx, endx).
4472 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4473 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode: only runs TransformProject
// on the POSITION array (in place) with the model-view-projection matrix uniform.
4478 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4480 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the deferred-geometry shader mode. The visible code writes no
// geometry data: the span's colour bytes are zeroed and passed to the span
// finisher, so this appears to be a placeholder implementation.
4483 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4486 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4487 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4488 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero 4 bytes per pixel for x in [startx, endx).
4489 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4490 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode: only runs
// TransformProject on the POSITION array (in place) with the
// model-view-projection matrix uniform.
4495 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4497 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the deferred-light-source shader mode. The visible code does
// no lighting: the span's colour bytes are zeroed and passed to the span
// finisher, so this appears to be a placeholder implementation.
4500 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4503 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4504 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4505 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// Zero 4 bytes per pixel for x in [startx, endx).
4506 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4507 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex and span (pixel) entry points plus the
// vertex arrays and texture units it uses.
// NOTE(review): the table initializers begin with an integer (2) before the two
// function pointers, so a leading field is presumably declared on a line that is
// not visible in this view -- confirm against the full source.
4512 typedef struct DPSOFTRAST_ShaderModeInfo_s
// Vertex stage; takes no arguments.
4515 void (*Vertex)(void);
// Pixel stage, invoked once per span.
4516 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Lists of array / texture-unit indices; the table writes ~0 (0xFF once stored in
// an unsigned char) after the last used entry, presumably as a terminator.
4517 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4518 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4520 DPSOFTRAST_ShaderModeInfo;
// One entry per shader mode, indexed by shader mode value (see the lookup via
// thread->shader_mode in DPSOFTRAST_Draw_ProcessSpans). Each row is:
//   leading integer, vertex shader, pixel (span) shader, {arrays..., ~0}, {texunits..., ~0}
// Row order must therefore match the SHADERMODE_* enumeration -- presumably the
// order of the function names below; verify against the enum definition.
4522 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4524 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4525 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4526 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4527 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4528 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4529 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4530 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4531 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4532 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4533 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4534 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4535 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below omit the texunits initializer, so that array
// is zero-filled rather than starting with ~0 as in the Depth_Or_Shadow row. If
// consumers scan texunits until they hit the ~0 marker, these rows have no
// marker at all -- confirm whether {~0} should be added.
4536 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4537 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4538 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4539 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Fills thread->pixelmaskarray for one span and narrows the span's x range.
// With depth testing enabled and a depth buffer present, each pixel's mask is the
// result of comparing the stored depth with the span's interpolated depth d
// according to thread->fb_depthfunc; otherwise every pixel in the span is marked
// visible. On exit span->pixelmask and span->startx are updated.
// NOTE(review): several short lines are not visible in this view (declarations of
// x, d, depth, depthslope, startx and endx, the endx load/store, and the bodies
// of the two trimming while loops) -- confirm against the full source.
4542 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4547 unsigned int *depthpixel;
4551 unsigned char *pixelmask;
4552 DPSOFTRAST_State_Triangle *triangle;
4553 triangle = &thread->triangles[span->triangle];
// Depth-buffer pointer for row span->y offset by span->x; the loops below index
// it with x, so x is relative to span->x.
4554 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4555 startx = span->startx;
4557 depth = span->depthbase;
4558 depthslope = span->depthslope;
4559 pixelmask = thread->pixelmaskarray;
4560 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4562 switch(thread->fb_depthfunc)
// In every case d starts at depth + depthslope*startx and advances by depthslope
// per pixel. Note the operand order: the STORED depth is on the left-hand side.
4565 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4566 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4567 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4568 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4569 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4570 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4571 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// Trim masked-out pixels from both ends of the span (loop bodies not visible;
// presumably startx++ and endx--).
4573 while (startx < endx && !pixelmask[startx])
4575 while (endx > startx && !pixelmask[endx-1])
4580 // no depth testing means we're just dealing with color...
4581 memset(pixelmask + startx, 1, endx - startx);
4583 span->pixelmask = pixelmask;
4584 span->startx = startx;
// Depth-write pass for one span. Does nothing unless depth writes (depthmask)
// and depth testing are both enabled and a depth buffer exists. Otherwise it
// walks x over [startx, endx) with d interpolated as depth + depthslope*x.
// NOTE(review): the endx assignment and the loop body are not visible in this
// view; presumably d is stored into depthpixel[x] where pixelmask[x] is set --
// confirm against the full source.
4588 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4590 int x, d, depth, depthslope, startx, endx;
4591 const unsigned char *pixelmask;
4592 unsigned int *depthpixel;
4593 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4595 depth = span->depthbase;
4596 depthslope = span->depthslope;
4597 pixelmask = span->pixelmask;
4598 startx = span->startx;
// Same addressing as the depth test: row span->y, offset by span->x.
4600 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4601 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes every queued span of this thread, then empties the queue.
// Per span: depth test -> pixel shader (only if colour buffer 0 exists and the
// colour mask is non-zero) -> depth write.
// NOTE(review): the declaration of i and the statement under the
// "startx >= endx" check are not visible in this view; presumably an empty span
// is skipped with continue -- confirm against the full source.
4607 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4610 DPSOFTRAST_State_Triangle *triangle;
4611 DPSOFTRAST_State_Span *span;
4612 for (i = 0; i < thread->numspans; i++)
4614 span = &thread->spans[i];
4615 triangle = &thread->triangles[span->triangle];
// The depth test may shrink the span; an empty range means nothing is visible.
4616 DPSOFTRAST_Draw_DepthTest(thread, span);
4617 if (span->startx >= span->endx)
4619 // run pixel shader if appropriate
4620 // do this before running depthmask code, to allow the pixelshader
4621 // to clear pixelmask values for alpha testing
4622 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// Dispatch through the shader mode table using the thread's current mode.
4623 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4624 DPSOFTRAST_Draw_DepthWrite(thread, span);
4626 thread->numspans = 0;
// Draw command payload: y extent of the draw (starty/endy), a reference count
// shared by the threads interpreting it, the clipped flag, the vertex range,
// the triangle count, the vertex data block (arrays) and the element lists.
4629 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Interprets one Draw command on behalf of a single thread.
// For every triangle of the command this sets up screen-space vertices
// (clipping against the near plane where needed), computes the triangle's
// bounding box against the scissor rectangle and this thread's y ranges,
// derives per-triangle interpolation planes and mip levels, and emits spans
// into thread->spans, flushing them through DPSOFTRAST_Draw_ProcessSpans when
// the span or triangle batch fills up.
// The command's refcount is decremented once by this thread; the thread that
// brings it to zero releases command->arrays when the command is no larger
// than the bare command header.
//   thread  - per-thread render state (span/triangle batches are modified)
//   command - the Draw command to interpret
4631 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4634 int cullface = thread->cullface;
4635 int minx, maxx, miny, maxy;
4636 int miny1, maxy1, miny2, maxy2;
4637 __m128i fbmin, fbmax;
4638 __m128 viewportcenter, viewportscale;
4639 int firstvertex = command->firstvertex;
4640 int numvertices = command->numvertices;
4641 int numtriangles = command->numtriangles;
4642 const int *element3i = command->element3i;
4643 const unsigned short *element3s = command->element3s;
4644 int clipped = command->clipped;
4651 int starty, endy, bandy;
4655 float clip0origin, clip0slope;
4657 __m128 triangleedge1, triangleedge2, trianglenormal;
4660 DPSOFTRAST_State_Triangle *triangle;
4661 DPSOFTRAST_Texture *texture;
4662 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp the thread's two y ranges (miny1..maxy1 and miny2..maxy2) to the scissor rectangle's y extent
4663 miny = thread->fb_scissor[1];
4664 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4665 miny1 = bound(miny, thread->miny1, maxy);
4666 maxy1 = bound(miny, thread->maxy1, maxy);
4667 miny2 = bound(miny, thread->miny2, maxy);
4668 maxy2 = bound(miny, thread->maxy2, maxy);
// early-out test: the command's y extent overlaps neither of this thread's y ranges
4669 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// drop this thread's reference; whoever reaches zero frees the vertex data
// when it is not stored inside the command itself (commandsize is just the header)
4671 if (!ATOMIC_DECREMENT(command->refcount))
4673 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4674 MM_FREE(command->arrays);
// fbmin/fbmax hold the inclusive (x, y) clamp bounds replicated across four 16-bit pairs
4678 minx = thread->fb_scissor[0];
4679 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4680 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4681 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4682 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4683 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4684 screen[3] = _mm_setzero_ps();
4685 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4686 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen coordinates (4 floats per vertex),
// followed by further per-vertex float[4] arrays
4688 const float *screencoord4f = command->arrays;
4689 const float *arrays = screencoord4f + numvertices*4;
4691 // generate the 3 edges of this triangle
4692 // generate spans for the triangle - switch based on left split or right split classification of triangle
// element indices are rebased so that firstvertex maps to index 0 of the arrays
4695 e[0] = element3s[i*3+0] - firstvertex;
4696 e[1] = element3s[i*3+1] - firstvertex;
4697 e[2] = element3s[i*3+2] - firstvertex;
4701 e[0] = element3i[i*3+0] - firstvertex;
4702 e[1] = element3i[i*3+1] - firstvertex;
4703 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds two screen-space edges around vertex 1, computes one
// component of their cross product as a scalar and tests its sign against zero
4712 #define SKIPBACKFACE \
4713 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4714 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4715 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4716 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4717 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4721 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4725 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
// CLIPPEDVERTEXLERP(k,p1,p2): records the crossing fraction
// clipdist[p1]/(clipdist[p1]-clipdist[p2]) in clipfrac[p1], interpolates the two
// vertices by it and projects the result into screen[k].
// CLIPPEDVERTEXCOPY(k,p1): loads the stored screen coordinate of vertex p1 into screen[k].
4730 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4731 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4733 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4734 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4736 #define CLIPPEDVERTEXCOPY(k,p1) \
4737 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY/GENATTRIBLERP: the same copy-or-interpolate choice applied to a
// vertex attribute, reusing the clipfrac values recorded by CLIPPEDVERTEXLERP.
// GENATTRIBS selects, for cases 0..6, copy or lerp for each of the three corners.
4739 #define GENATTRIBCOPY(attrib, p1) \
4740 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4741 #define GENATTRIBLERP(attrib, p1, p2) \
4743 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4744 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4746 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4750 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4751 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4752 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4753 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4754 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4755 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4756 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4762 // calculate distance from nearplane
// (sum of the third and fourth components of each vertex in the first array after the screen coordinates)
4763 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4764 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4765 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// the sign pattern of the three distances selects which corners are copied and
// which are replaced by near-plane intersections (three or four output points)
4766 if (clipdist[0] >= 0.0f)
4768 if (clipdist[1] >= 0.0f)
4770 if (clipdist[2] >= 0.0f)
4773 // triangle is entirely in front of nearplane
4774 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4781 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4789 if (clipdist[2] >= 0.0f)
4791 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4798 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4805 else if (clipdist[1] >= 0.0f)
4807 if (clipdist[2] >= 0.0f)
4809 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4816 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4822 else if (clipdist[2] >= 0.0f)
4824 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4829 else continue; // triangle is entirely behind nearplane
4832 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as signed 16-bit
// values; the min/max reductions below yield the bounding box, which is then
// clamped to fbmin/fbmax
4833 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4834 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4835 screenmin = _mm_min_epi16(screeni, screenir),
4836 screenmax = _mm_max_epi16(screeni, screenir);
4837 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4838 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4839 screenmin = _mm_max_epi16(screenmin, fbmin);
4840 screenmax = _mm_min_epi16(screenmax, fbmax);
4841 // skip offscreen triangles
4842 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// starty is inclusive, endy exclusive
4844 starty = _mm_extract_epi16(screenmin, 1);
4845 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle's rows fall entirely between this thread's first and second y range
4846 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the y halves of the packed pairs as 32-bit integers
4848 screeny = _mm_srai_epi32(screeni, 16);
4851 triangle = &thread->triangles[thread->numtriangles];
4853 // calculate attribute plans for triangle data...
4854 // okay, this triangle is going to produce spans, we'd better project
4855 // the interpolants now (this is what gives perspective texturing),
4856 // this consists of simply multiplying all arrays by the W coord
4857 // (which is basically 1/Z), which will be undone per-pixel
4858 // (multiplying by Z again) to get the perspective-correct array
4861 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4862 __m128 mipedgescale, mipdensity;
// edge components divided by the scalar computed in SKIPBACKFACE give the
// coefficients that turn per-edge deltas into per-pixel x/y slopes
4863 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4864 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4865 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4866 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4867 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for the fourth screen component (w): x slope, y slope and origin go to triangle->w[0..2]
4868 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4869 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4870 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4871 attribedge1 = _mm_sub_ss(w0, w1);
4872 attribedge2 = _mm_sub_ss(w2, w1);
4873 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4874 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4875 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4876 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4877 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4878 _mm_store_ss(&triangle->w[0], attribxslope);
4879 _mm_store_ss(&triangle->w[1], attribyslope);
4880 _mm_store_ss(&triangle->w[2], attriborigin);
// user clip plane: only evaluated when one of the first three plane
// coefficients is nonzero; the plane is combined with the triangle's
// third-component (z) gradient to obtain a per-scanline boundary
// (clip0origin + y*clip0slope) and the side to keep (clip0dir)
4885 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4887 float cliporigin, clipxslope, clipyslope;
4888 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4889 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4890 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4891 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4892 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4893 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4894 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4895 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4896 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4899 clip0origin = -cliporigin/clipxslope;
4900 clip0slope = -clipyslope/clipxslope;
4901 clip0dir = clipxslope > 0 ? 1 : -1;
4903 else if(clipyslope > 0)
4905 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4906 clip0slope = dpsoftrast.fb_width;
4909 else if(clipyslope < 0)
4911 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4912 clip0slope = -dpsoftrast.fb_width;
4915 else if(clip0origin < 0) continue;
// per-attribute interpolation planes: each attribute is multiplied by the
// corner's w before the slopes are derived, and stored as
// triangle->attribs[k][0..2] = x slope, y slope, origin
4918 mipedgescale = _mm_setzero_ps();
4919 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4921 __m128 attrib0, attrib1, attrib2;
4922 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4923 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4925 arrays += numvertices*4;
4926 GENATTRIBS(attrib0, attrib1, attrib2);
4927 attriborigin = _mm_mul_ps(attrib1, w1);
4928 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4929 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4930 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4931 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4932 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4933 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4934 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4935 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the shader's LOD array: attribute delta along each edge scaled by the
// reciprocal screen-space edge length (_mm_rsqrt_ps is an approximation)
4936 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4938 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4939 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4940 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4941 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// per-texture-unit mip level: defaults to 0, computed only for bound textures
// whose filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4945 memset(triangle->mip, 0, sizeof(triangle->mip));
4946 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4948 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4949 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4951 texture = thread->texbound[texunit];
4952 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4954 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4955 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4956 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4957 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4958 // this will be multiplied in the texturing routine by the texture resolution
4959 y = _mm_cvtss_si32(mipdensity);
// half of log2 of the squared density, clamped to the last available mip level
4962 y = (int)(log((float)y)*0.5f/M_LN2);
4963 if (y > texture->mipmaps - 1)
4964 y = texture->mipmaps - 1;
4965 triangle->mip[texunit] = y;
// span generation: walks rows from starty to endy, first within the thread's
// first y range (bandy = min(endy, maxy1)), then jumping to the second
// (y = max(y, miny2), bandy = min(endy, maxy2))
4971 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4974 __m128 xcoords, xslope;
// yccmask has one nibble per point telling whether y is greater than that
// point's integer y; the pattern selects the two active edges (prev/next point pairs)
4975 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4976 int yccmask = _mm_movemask_epi8(ycc);
4977 int edge0p, edge0n, edge1p, edge1n;
4986 case 0xFFFF: /*0000*/ y = endy; continue;
4987 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4988 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4989 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4990 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4991 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4992 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4993 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4994 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4995 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4996 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4997 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4998 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4999 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5000 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5001 case 0x0000: /*1111*/ y++; continue;
5009 case 0xFFFF: /*000*/ y = endy; continue;
5010 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5011 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5012 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5013 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5014 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5015 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5016 case 0x0000: /*111*/ y++; continue;
// nexty is the last row that can be walked with the current edge pair,
// limited to the current y range (bandy-1)
5019 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5020 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5021 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5022 nexty = _mm_extract_epi16(ycc, 0);
5023 if (nexty >= bandy) nexty = bandy-1;
// xslope = dx/dy of both edges; xcoords = edge x positions at row y, plus 0.5
5024 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5025 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5026 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5027 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5028 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low lane so that lane 0 is the left edge
5029 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5031 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5032 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5034 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5035 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5037 int startx, endx, clipx = minx, offset;
5038 startx = _mm_cvtss_si32(xcoords);
5039 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// startx is advanced in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH towards
// the left clamp; the exact start inside a chunk is carried by clipx
5042 if (startx < 0) startx = 0;
5043 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5045 if (endx > maxx) endx = maxx;
5046 if (startx >= endx) continue;
5054 if(endx <= clip0) continue;
5055 clipx = max((int)clip0, minx);
5056 startx += (clipx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5059 else if (endx > clip0)
5061 if(startx >= clip0) continue;
// emit one span per chunk of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the chunk origin
5066 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5068 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5069 span->triangle = thread->numtriangles;
5072 span->startx = max(clipx - offset, 0);
5073 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5074 if (span->startx >= span->endx)
// fixed-point depth at the span origin and its per-pixel step, derived from the
// w plane; polygonoffset[0] scales with |wslope|, polygonoffset[1] is constant
5076 wslope = triangle->w[0];
5077 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5078 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5079 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span queue full: rasterize what has been queued so far
5080 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5081 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle batch full: flush spans that still reference it, then restart the batch
5086 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5088 DPSOFTRAST_Draw_ProcessSpans(thread);
5089 thread->numtriangles = 0;
// drop this thread's reference to the command (same release rule as the early-out path)
5093 if (!ATOMIC_DECREMENT(command->refcount))
5095 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5096 MM_FREE(command->arrays);
// flush whatever is still queued so no span outlives this command
5099 if (thread->numspans > 0 || thread->numtriangles > 0)
5101 DPSOFTRAST_Draw_ProcessSpans(thread);
5102 thread->numtriangles = 0;
// Allocates a Draw command together with the storage for its vertex data and
// element list, and points the global post-transform array slots at that storage.
//   firstvertex, numvertices - vertex range of the draw
//   numtriangles             - number of triangles in the element list
//   element3i / element3s    - 32-bit / 16-bit element lists (copied into the command data)
// The data block holds, in order: screen coordinates, the position array, one
// float[4]-per-vertex array for each array index the shader mode uses, and the
// element list.  If header plus data exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the
// data is allocated separately with MM_CALLOC; otherwise it follows the command
// header inside the command pool.
5107 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5111 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] per vertex up front: screen coordinates and positions
5112 int datasize = 2*numvertices*sizeof(float[4]);
5113 DPSOFTRAST_Command_Draw *command;
5114 unsigned char *data;
// one more float[4] per vertex for each array the current shader mode lists
5115 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5117 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5118 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5120 datasize += numvertices*sizeof(float[4]);
// room for the element list (16-bit or 32-bit indices, three per triangle)
5123 datasize += numtriangles*sizeof(unsigned short[3]);
5125 datasize += numtriangles*sizeof(int[3]);
5126 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large for a single pool command: keep only the header in the pool
5127 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5129 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5130 data = (unsigned char *)MM_CALLOC(datasize, 1);
5134 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5135 data = (unsigned char *)command + commandsize;
5137 command->firstvertex = firstvertex;
5138 command->numvertices = numvertices;
5139 command->numtriangles = numtriangles;
5140 command->arrays = (float *)data;
// carve the data block into the global output arrays; unused slots stay NULL
5141 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5142 dpsoftrast.firstvertex = firstvertex;
5143 dpsoftrast.numvertices = numvertices;
5144 dpsoftrast.screencoord4f = (float *)data;
5145 data += numvertices*sizeof(float[4]);
5146 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5147 data += numvertices*sizeof(float[4]);
5148 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5150 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5151 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5153 dpsoftrast.post_array4f[j] = (float *)data;
5154 data += numvertices*sizeof(float[4]);
// the element list is copied so the command does not depend on caller memory
5156 command->element3i = NULL;
5157 command->element3s = NULL;
5160 command->element3s = (unsigned short *)data;
5161 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5165 command->element3i = (int *)data;
5166 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex function, records the y extent produced
// (dpsoftrast.drawstarty/drawendy clamped to the framebuffer height) and hands
// the command to the worker threads, or flushes directly when not threaded.
//   firstvertex, numvertices - vertex range to transform
//   numtriangles             - triangle count of the element list
//   element3i / element3s    - 32-bit / 16-bit element lists
5171 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5173 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5174 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5175 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5176 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty y extent: release separately allocated vertex data and take the command back out of the pool
5177 if (command->starty >= command->endy)
5179 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5180 MM_FREE(command->arrays);
5181 DPSOFTRAST_UndoCommand(command->commandsize);
// every thread interprets the command once and drops one reference
5184 command->clipped = dpsoftrast.drawclipped;
5185 command->refcount = dpsoftrast.numthreads;
5187 if (dpsoftrast.usethreads)
5190 DPSOFTRAST_Draw_SyncCommands();
// wake starving threads whose y ranges overlap the command's extent
5191 for (i = 0; i < dpsoftrast.numthreads; i++)
5193 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5194 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5195 Thread_CondSignal(thread->drawcond);
5200 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command payload: the new framebuffer width and height.
5204 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: flags the framebuffer-dependent state for revalidation.
5205 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5207 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public entry point that rebinds the framebuffer: stores the new dimensions,
// depth buffer and up to four color buffers in the global state, recomputes
// the viewport transform and queues a SetRenderTargets command for the threads.
//   width, height   - framebuffer dimensions
//   depthpixels     - depth buffer
//   colorpixels0..3 - color buffers
5209 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5211 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target differs from what is currently bound
5212 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5213 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5214 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5216 dpsoftrast.fb_width = width;
5217 dpsoftrast.fb_height = height;
5218 dpsoftrast.fb_depthpixels = depthpixels;
5219 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5220 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5221 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5222 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport center/scale depend on the framebuffer size
5223 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5224 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5225 command->width = width;
5226 command->height = height;
// Runs the thread's command interpreter from thread->commandoffset up to (not
// including) endoffset, dispatching each command in the shared command pool to
// its DPSOFTRAST_Interpret_* handler.  The read position wraps to 0 when it
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL and is stored back into
// thread->commandoffset.
//   thread    - thread whose state the commands are applied to
//   endoffset - pool offset at which to stop
5229 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5231 int commandoffset = thread->commandoffset;
5232 while (commandoffset != endoffset)
5234 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5235 switch (command->opcode)
// INTERPCOMMAND(name): case for a fixed-size command - call its handler, then
// advance by the aligned size of its struct
5237 #define INTERPCOMMAND(name) \
5238 case DPSOFTRAST_OPCODE_##name : \
5239 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5240 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5241 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5242 commandoffset = 0; \
5244 INTERPCOMMAND(Viewport)
5245 INTERPCOMMAND(ClearColor)
5246 INTERPCOMMAND(ClearDepth)
5247 INTERPCOMMAND(ColorMask)
5248 INTERPCOMMAND(DepthTest)
5249 INTERPCOMMAND(ScissorTest)
5250 INTERPCOMMAND(Scissor)
5251 INTERPCOMMAND(BlendFunc)
5252 INTERPCOMMAND(BlendSubtract)
5253 INTERPCOMMAND(DepthMask)
5254 INTERPCOMMAND(DepthFunc)
5255 INTERPCOMMAND(DepthRange)
5256 INTERPCOMMAND(PolygonOffset)
5257 INTERPCOMMAND(CullFace)
5258 INTERPCOMMAND(AlphaTest)
5259 INTERPCOMMAND(AlphaFunc)
5260 INTERPCOMMAND(SetTexture)
5261 INTERPCOMMAND(SetShader)
5262 INTERPCOMMAND(Uniform4f)
5263 INTERPCOMMAND(UniformMatrix4f)
5264 INTERPCOMMAND(Uniform1i)
5265 INTERPCOMMAND(SetRenderTargets)
5266 INTERPCOMMAND(ClipPlane)
// Draw commands are variable-sized (the vertex data may be stored inline), so
// the stride comes from the command itself; progress is published right after
// each draw rather than only at the end of the loop
5268 case DPSOFTRAST_OPCODE_Draw:
5269 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5270 commandoffset += command->commandsize;
5271 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5273 thread->commandoffset = commandoffset;
5276 case DPSOFTRAST_OPCODE_Reset:
5281 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread).
//   data - the DPSOFTRAST_State_Thread owned by this worker
// Loops while thread->index >= 0: interprets commands up to
// dpsoftrast.drawcommand, and when it has caught up it wakes a flusher waiting
// on waitcond, marks itself starving and sleeps on drawcond.
5284 static int DPSOFTRAST_Draw_Thread(void *data)
5286 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5287 while(thread->index >= 0)
5289 if (thread->commandoffset != dpsoftrast.drawcommand)
5291 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5295 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex so a command published in between is not slept through
5296 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5298 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5299 thread->starving = true;
5300 Thread_CondWait(thread->drawcond, thread->drawmutex);
5301 thread->starving = false;
5303 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread's command interpreter up to dpsoftrast.drawcommand and
// then marks the command pool as empty.
// With worker threads: first wakes each starving thread that is behind, then
// blocks on each lagging thread's waitcond until it has caught up.
// Otherwise the pending commands are interpreted for each thread state on the
// calling thread.
5309 static void DPSOFTRAST_Draw_FlushThreads(void)
5311 DPSOFTRAST_State_Thread *thread;
5313 DPSOFTRAST_Draw_SyncCommands();
5314 if (dpsoftrast.usethreads)
// pass 1: kick sleeping workers that still have commands to interpret
5316 for (i = 0; i < dpsoftrast.numthreads; i++)
5318 thread = &dpsoftrast.threads[i];
5319 if (thread->commandoffset != dpsoftrast.drawcommand)
5321 Thread_LockMutex(thread->drawmutex);
5322 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5323 Thread_CondSignal(thread->drawcond);
5324 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker; the condition is re-checked under the mutex
// before sleeping, and 'waiting' tells the worker to signal waitcond
5327 for (i = 0; i < dpsoftrast.numthreads; i++)
5329 thread = &dpsoftrast.threads[i];
5330 if (thread->commandoffset != dpsoftrast.drawcommand)
5332 Thread_LockMutex(thread->drawmutex);
5333 if (thread->commandoffset != dpsoftrast.drawcommand)
5335 thread->waiting = true;
5336 Thread_CondWait(thread->waitcond, thread->drawmutex);
5337 thread->waiting = false;
5339 Thread_UnlockMutex(thread->drawmutex);
// direct interpretation of pending commands on the calling thread
5345 for (i = 0; i < dpsoftrast.numthreads; i++)
5347 thread = &dpsoftrast.threads[i];
5348 if (thread->commandoffset != dpsoftrast.drawcommand)
5349 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// all commands consumed: the pool can be reused from scratch
5352 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point: blocks until all queued commands have been interpreted
// by every thread (delegates to DPSOFTRAST_Draw_FlushThreads).
5355 void DPSOFTRAST_Flush(void)
5357 DPSOFTRAST_Draw_FlushThreads();
// Public entry point taking no arguments and returning nothing.
// NOTE(review): presumably completes all outstanding work like DPSOFTRAST_Flush - confirm against the body.
5360 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and the per-thread states.
//   width, height - framebuffer dimensions (also the initial viewport and scissor)
//   numthreads    - requested worker count; threading is used only when this is
//                   > 0 and Thread_HasThreads() reports support, and the count
//                   is then clamped to 1..64 (a single thread state otherwise)
//   interlace     - clamped to 0..1, forced to 0 when threading is not used
//   colorpixels   - color buffer bound as fb_colorpixels[0]
//   depthpixels   - depth buffer
// Each thread state receives the default render state set below; when threading
// is used, the thread's condition variables, mutex and worker are created too.
5365 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5375 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// u.b[3] is the byte that tells the host byte order apart
5376 dpsoftrast.bigendian = u.b[3];
5377 dpsoftrast.fb_width = width;
5378 dpsoftrast.fb_height = height;
5379 dpsoftrast.fb_depthpixels = depthpixels;
5380 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only color buffer 0 is bound initially; the other three slots are cleared
// (previously index 1 was cleared three times and 2/3 relied on the memset above)
5381 dpsoftrast.fb_colorpixels[1] = NULL;
5382 dpsoftrast.fb_colorpixels[2] = NULL;
5383 dpsoftrast.fb_colorpixels[3] = NULL;
5384 dpsoftrast.viewport[0] = 0;
5385 dpsoftrast.viewport[1] = 0;
5386 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5387 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5388 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts with texture_firstfree and texture_end at 1 and no capacity (texture_max = 0)
5389 dpsoftrast.texture_firstfree = 1;
5390 dpsoftrast.texture_end = 1;
5391 dpsoftrast.texture_max = 0;
5392 dpsoftrast.color[0] = 1;
5393 dpsoftrast.color[1] = 1;
5394 dpsoftrast.color[2] = 1;
5395 dpsoftrast.color[3] = 1;
5396 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5397 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5398 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
// zero-initialized, so any field not assigned below starts out as 0
5399 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5400 for (i = 0; i < dpsoftrast.numthreads; i++)
5402 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5404 thread->cullface = GL_BACK;
// NOTE(review): colormask[0] is not assigned in these lines, unlike [1]..[3] - confirm it is enabled elsewhere
5405 thread->colormask[1] = 1;
5406 thread->colormask[2] = 1;
5407 thread->colormask[3] = 1;
5408 thread->blendfunc[0] = GL_ONE;
5409 thread->blendfunc[1] = GL_ZERO;
5410 thread->depthmask = true;
5411 thread->depthtest = true;
5412 thread->depthfunc = GL_LEQUAL;
5413 thread->scissortest = false;
5414 thread->alphatest = false;
5415 thread->alphafunc = GL_GREATER;
5416 thread->alphavalue = 0.5f;
5417 thread->viewport[0] = 0;
5418 thread->viewport[1] = 0;
5419 thread->viewport[2] = dpsoftrast.fb_width;
5420 thread->viewport[3] = dpsoftrast.fb_height;
5421 thread->scissor[0] = 0;
5422 thread->scissor[1] = 0;
5423 thread->scissor[2] = dpsoftrast.fb_width;
5424 thread->scissor[3] = dpsoftrast.fb_height;
5425 thread->depthrange[0] = 0;
5426 thread->depthrange[1] = 1;
5427 thread->polygonoffset[0] = 0;
5428 thread->polygonoffset[1] = 0;
5429 thread->clipplane[0] = 0;
5430 thread->clipplane[1] = 0;
5431 thread->clipplane[2] = 0;
5432 thread->clipplane[3] = 1;
5434 thread->numspans = 0;
5435 thread->numtriangles = 0;
5436 thread->commandoffset = 0;
5437 thread->waiting = false;
5438 thread->starving = false;
// force a full validation pass so all derived state is computed once up front
5440 thread->validate = -1;
5441 DPSOFTRAST_Validate(thread, -1);
5443 if (dpsoftrast.usethreads)
5445 thread->waitcond = Thread_CreateCond();
5446 thread->drawcond = Thread_CreateCond();
5447 thread->drawmutex = Thread_CreateMutex();
5448 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5454 void DPSOFTRAST_Shutdown(void)
5457 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5459 DPSOFTRAST_State_Thread *thread;
5460 for (i = 0; i < dpsoftrast.numthreads; i++)
5462 thread = &dpsoftrast.threads[i];
5463 Thread_LockMutex(thread->drawmutex);
5465 Thread_CondSignal(thread->drawcond);
5466 Thread_UnlockMutex(thread->drawmutex);
5467 Thread_WaitThread(thread->thread, 0);
5468 Thread_DestroyCond(thread->waitcond);
5469 Thread_DestroyCond(thread->drawcond);
5470 Thread_DestroyMutex(thread->drawmutex);
5473 for (i = 0;i < dpsoftrast.texture_end;i++)
5474 if (dpsoftrast.texture[i].bytes)
5475 MM_FREE(dpsoftrast.texture[i].bytes);
5476 if (dpsoftrast.texture)
5477 free(dpsoftrast.texture);
5478 if (dpsoftrast.threads)
5479 MM_FREE(dpsoftrast.threads);
5480 memset(&dpsoftrast, 0, sizeof(dpsoftrast));