3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Alignment (in bytes) handed to _mm_malloc by MM_MALLOC/MM_CALLOC.
// Note that the ATOMIC() macros below spell out 32 literally instead of using this name.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of:
//   ALIGN(var)   - declare var with 16-byte alignment
//   ATOMIC(var)  - declare var with 32-byte alignment
//   MEMORY_BARRIER, ATOMIC_COUNTER, ATOMIC_INCREMENT/DECREMENT/ADD
// Apple: counters are volatile int32_t driven by the OSAtomic*32Barrier functions.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// Other GCC-compatible compilers: counters are volatile int driven by the __sync_* builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: counters are volatile LONG driven by the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: no alignment request, and plain (non-atomic) integer operations
// for anything the branches above did not define.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// Aligned allocation wrappers. The first group is built on _mm_malloc/_mm_free with
// ATOMIC_SIZE alignment; the second group maps to the plain C allocator.
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
// calloc-style helper: allocates nmemb*size bytes aligned to ATOMIC_SIZE and zero-fills
// them when the allocation succeeds.
// NOTE(review): nmemb*size is not checked for overflow in the lines visible here.
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
85 #define MM_FREE _mm_free
// Variant without _mm_malloc: use malloc/calloc directly.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color, then eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the count and is used to size attribute tables
// (e.g. the triangle attribs[] array and post_array4f[]).
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One texture slot in the dpsoftrast.texture array. A slot counts as unused while
// `bytes` is NULL (see DPSOFTRAST_Texture_GetByIndex / DPSOFTRAST_Texture_New).
108 typedef struct DPSOFTRAST_Texture_s
115 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; presumably the number of outstanding binds - TODO confirm against users
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated with MM_CALLOC and released with MM_FREE
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
// (as filled in by DPSOFTRAST_Texture_New)
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Commands are padded to multiples of COMMAND_SIZE (see DPSOFTRAST_ALIGNCOMMAND) and
// declared with the ALIGN() attribute.
// NOTE(review): ALIGN_SIZE is not defined in the portion of the file visible here.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every command: an opcode plus the total size of the command.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is written by DPSOFTRAST_AllocateCommand when it wraps to the start of the pool.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and
// an aligned struct DPSOFTRAST_Command_<name> that starts with the common header
// (opcode, commandsize).
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer (2 MiB).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - not used in the visible lines.
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command buffer. DPSOFTRAST_AllocateCommand writes commands into `commands` and wraps
// back to the start when the remaining tail is too small.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data consumed when shading spans.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// For each attribute array: three 4-float rows. As used by DPSOFTRAST_CALCATTRIB,
// row [0] is multiplied by span x, row [1] by span y, and row [2] is the constant term.
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// SSE form: evaluates attribute `arrayindex` of `triangle` at the start of `span`.
//   slope = attribs[arrayindex][0]                      (per-pixel x delta)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// Uses aligned loads (_mm_load_ps) on the attribs rows.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
// Scalar form of the same computation, writing into float[4] arrays `slope` and `data`.
// NOTE(review): `span` is used as (span->x) here rather than (span)->x as in the SSE form,
// so the argument is expected to be a simple pointer expression.
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of pixels on a single framebuffer row, produced from a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
191 int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
192 int depthslope; // depthbuffer value pixel delta
194 DPSOFTRAST_State_Span);
// Capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers
// in DPSOFTRAST_State_Thread.
196 #define DPSOFTRAST_DRAW_MAXSPANS 1024
197 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
198 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Dirty bits kept in thread->validate. Each bit marks one group of derived state that
// DPSOFTRAST_Validate recomputes (RecalcFB / RecalcDepthFunc / RecalcBlendFunc).
200 #define DPSOFTRAST_VALIDATE_FB 1
201 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
202 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All of the bits above combined.
203 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Internal blend modes. DPSOFTRAST_RecalcBlendFunc selects one of these from the
// (sfactor, dfactor) pair and the blendsubtract flag; unrecognised pairs become OPAQUE.
205 typedef enum DPSOFTRAST_BLENDMODE_e
207 DPSOFTRAST_BLENDMODE_OPAQUE,
208 DPSOFTRAST_BLENDMODE_ALPHA,
209 DPSOFTRAST_BLENDMODE_ADDALPHA,
210 DPSOFTRAST_BLENDMODE_ADD,
211 DPSOFTRAST_BLENDMODE_INVMOD,
212 DPSOFTRAST_BLENDMODE_MUL,
213 DPSOFTRAST_BLENDMODE_MUL2,
214 DPSOFTRAST_BLENDMODE_SUBALPHA,
215 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
216 DPSOFTRAST_BLENDMODE_INVADD,
217 DPSOFTRAST_BLENDMODE_TOTAL
219 DPSOFTRAST_BLENDMODE;
// Per-thread rasterizer state. The DPSOFTRAST_Interpret_* functions write the raw state
// here and set bits in `validate`; the fb_* members are derived values refreshed by
// DPSOFTRAST_Validate.
221 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
240 float polygonoffset[2];
// clip plane rescaled by the viewport transform (see DPSOFTRAST_RecalcClipPlane)
242 ALIGN(float fb_clipplane[4]);
245 int shader_permutation;
246 int shader_exactspecularmath;
// textures bound per unit; these point into dpsoftrast.texture and are rebased by
// DPSOFTRAST_Texture_Grow when that array is reallocated
248 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
250 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
251 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
253 // DPSOFTRAST_VALIDATE_ flags
256 // derived values (DPSOFTRAST_VALIDATE_FB)
259 ALIGN(float fb_viewportcenter[4]);
260 ALIGN(float fb_viewportscale[4]);
262 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
265 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// this thread's position in the command pool; compared against dpsoftrast.drawcommand
// and the pool's freecommand in DPSOFTRAST_Draw_FreeCommandPool
274 ATOMIC(volatile int commandoffset);
// handshake flags used together with drawmutex/drawcond/waitcond in DPSOFTRAST_Draw_FreeCommandPool
276 volatile bool waiting;
277 volatile bool starving;
284 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
285 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
286 unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
288 DPSOFTRAST_State_Thread);
// Global rasterizer state used by the public API functions; the single instance is
// `dpsoftrast` below.
290 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer targets: one depth buffer and up to four color buffers
// (NULL color entries are tested for in DPSOFTRAST_Interpret_ClearColor)
294 unsigned int *fb_depthpixels;
295 unsigned int *fb_colorpixels[4];
// viewport transform derived from `viewport` by DPSOFTRAST_RecalcViewport
298 ALIGN(float fb_viewportcenter[4]);
299 ALIGN(float fb_viewportscale[4]);
302 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
303 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array pointers supplied by the caller
305 const float *pointer_vertex3f;
306 const float *pointer_color4f;
307 const unsigned char *pointer_color4ub;
308 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
311 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
312 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// bound textures; pointers into `texture`, rebased by DPSOFTRAST_Texture_Grow
313 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
317 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
318 float *screencoord4f;
324 int shader_permutation;
325 int shader_exactspecularmath;
// texture table: `texture` is a realloc-grown array; texture_firstfree is the index
// where DPSOFTRAST_Texture_New starts searching for an empty slot
329 int texture_firstfree;
330 DPSOFTRAST_Texture *texture;
// set to a static message by API functions when they reject their arguments
335 const char *errorstring;
// array of per-thread states (indexed 0..numthreads-1 in the visible code)
340 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand written by DPSOFTRAST_Draw_SyncCommands
342 ATOMIC(volatile int drawcommand);
344 DPSOFTRAST_State_Command_Pool commandpool;
// the one global state instance
348 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a float depth to the integer depth-buffer format (2^30).
350 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
351 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into a 32-bit pixel: a in bits 24-31, r in 16-23, g in 8-15,
// b in 0-7. Each channel is scaled by 255 and rounded by adding 0.5; no clamping is done.
// NOTE(review): the arguments are not parenthesized in the expansion, so pass simple expressions.
352 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
// Converts float depth d to an integer depth value; the mapping is inverted (1-d), so
// larger d yields a smaller integer.
// NOTE(review): `d` is not parenthesized in the expansion.
353 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
// Derives the viewport transform from `viewport` = {x, y, width, height}.
// Both outputs are float[4] laid out as [0]=w, [1]=x, [2]=y, [3]=z.
// Y is flipped against dpsoftrast.fb_height (negative y scale), and the x/y centers are
// shifted by -0.5.
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
360 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362 fb_viewportcenter[3] = 0.5f;
363 fb_viewportcenter[0] = 0.0f;
364 fb_viewportscale[1] = 0.5f * viewport[2];
365 fb_viewportscale[2] = -0.5f * viewport[3];
366 fb_viewportscale[3] = 0.5f;
367 fb_viewportscale[0] = 1.0f;
// Computes the framebuffer row ranges [miny1,maxy1) and [miny2,maxy2) owned by `thread`.
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
// Interlaced: the height is cut into 2*numthreads strips; the thread gets strip `index`
// and strip `numthreads+index`.
372 if (dpsoftrast.interlace)
374 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// Otherwise: one strip per thread; both ranges are set to the same values.
381 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes thread->fb_clipplane from thread->clipplane using the thread's viewport
// transform. The plane is stored as (x, y, z, w) while the viewport arrays are laid out
// [0]=w, [1]=x, [2]=y, [3]=z, hence the shifted indices.
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
388 thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389 thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390 thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391 thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
// subtract the contribution of the viewport center from the plane's constant term
392 thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
// Recomputes everything guarded by DPSOFTRAST_VALIDATE_FB: the scissor rectangle in
// framebuffer coordinates, the viewport transform, the clip plane and the thread's rows.
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
397 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398 // and viewport projection values
// scissor is {x, y, width, height}; y is flipped against the framebuffer height
401 x1 = thread->scissor[0];
402 x2 = thread->scissor[0] + thread->scissor[2];
403 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test disabled the rectangle is the whole framebuffer
405 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
407 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
409 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
// fb_scissor is stored as {x, y, width, height}
410 thread->fb_scissor[0] = x1;
411 thread->fb_scissor[1] = y1;
412 thread->fb_scissor[2] = x2 - x1;
413 thread->fb_scissor[3] = y2 - y1;
// RecalcClipPlane reads the viewport values, so it has to follow RecalcViewport
415 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416 DPSOFTRAST_RecalcClipPlane(thread);
417 DPSOFTRAST_RecalcThread(thread);
// Effective depth function: the configured depthfunc while depth testing is enabled,
// GL_ALWAYS otherwise.
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
422 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the (sfactor, dfactor) pair in thread->blendfunc to a DPSOFTRAST_BLENDMODE value
// in thread->fb_blendmode. The two factors are packed into one switch key as
// (sfactor<<16)|dfactor; any pair not listed falls back to OPAQUE.
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA/GL_ONE is recognised
427 if (thread->blendsubtract)
429 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
431 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular blending
439 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
441 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both spellings of dst*src select the same MUL mode
446 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// Calls DPSOFTRAST_Validate only when at least one of the requested bits is dirty.
// The expression always evaluates to 0.
// NOTE(review): `thread` is not parenthesized in the expansion.
456 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Recomputes the derived state for every bit of `mask` that is currently set in
// thread->validate, clearing each bit before its Recalc function runs.
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
// restrict the request to the bits that are actually dirty
460 mask &= thread->validate;
463 if (mask & DPSOFTRAST_VALIDATE_FB)
465 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466 DPSOFTRAST_RecalcFB(thread);
468 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
470 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471 DPSOFTRAST_RecalcDepthFunc(thread);
473 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
475 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by index. The slot is returned only when index is in
// [1, texture_end) and the slot has pixel storage (bytes != NULL); index 0 is never valid.
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
482 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483 return &dpsoftrast.texture[index];
// Enlarges the dpsoftrast.texture array and rebases every bound-texture pointer (global
// and per thread) onto the new array.
487 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be converted to offsets
489 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490 DPSOFTRAST_State_Thread *thread;
494 // expand texture array as needed
495 if (dpsoftrast.texture_max < 1024)
496 dpsoftrast.texture_max = 1024;
498 dpsoftrast.texture_max *= 2;
// NOTE(review): the realloc result overwrites dpsoftrast.texture directly and no NULL
// check is visible here; on failure the old array would be lost. The rebasing below also
// does pointer arithmetic against `oldtexture` after realloc may have freed that block.
499 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the globally bound textures
500 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501 if (dpsoftrast.texbound[i])
502 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bound textures
503 for (j = 0; j < dpsoftrast.numthreads; j++)
505 thread = &dpsoftrast.threads[j];
506 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507 if (thread->texbound[i])
508 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture of the given dimensions. `flags` carries the format (masked with
// DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK) plus the CUBEMAP/MIPMAP flag bits.
// On a rejected argument combination dpsoftrast.errorstring is set to a description.
// The chosen slot is the first one at or after texture_firstfree whose bytes is NULL.
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces
521 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is evaluated before the MAXSIZE check below, so very large
// arguments could overflow int here.
524 if (width*height*depth < 1)
526 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
529 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
531 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
536 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
540 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
543 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
548 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
551 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
553 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected
558 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
560 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
563 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
565 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): same condition and message as the check immediately above (duplicate).
568 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
570 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
573 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
578 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
580 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
583 // find first empty slot in texture array
584 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585 if (!dpsoftrast.texture[texnum].bytes)
// the next search starts after the slot just taken
587 dpsoftrast.texture_firstfree = texnum + 1;
588 if (dpsoftrast.texture_max <= texnum)
589 DPSOFTRAST_Texture_Grow();
590 if (dpsoftrast.texture_end <= texnum)
591 dpsoftrast.texture_end = texnum + 1;
592 texture = &dpsoftrast.texture[texnum];
593 memset(texture, 0, sizeof(*texture));
594 texture->flags = flags;
595 texture->width = width;
596 texture->height = height;
597 texture->depth = depth;
598 texture->sides = sides;
// mip chain layout: 4 bytes per texel; each level records its byte offset (size so far),
// byte size and dimensions
610 s = w * h * d * sides * 4;
611 texture->mipmap[mipmaps][0] = size;
612 texture->mipmap[mipmaps][1] = s;
613 texture->mipmap[mipmaps][2] = w;
614 texture->mipmap[mipmaps][3] = h;
615 texture->mipmap[mipmaps][4] = d;
// condition tested after recording a level: a 1x1x1 level, or a texture without the MIPMAP flag
618 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
624 texture->mipmaps = mipmaps;
625 texture->size = size;
627 // allocate the pixels now
// NOTE(review): no check of the allocation result is visible here.
628 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases the pixel storage of texture `index` and marks the slot empty.
// Does nothing when the index does not refer to a live texture.
632 void DPSOFTRAST_Texture_Free(int index)
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
639 MM_FREE(texture->bytes);
640 texture->bytes = NULL;
641 memset(texture, 0, sizeof(*texture));
642 // adjust the free range and used range
643 if (dpsoftrast.texture_firstfree > index)
644 dpsoftrast.texture_firstfree = index;
// shrink texture_end past any trailing empty slots
645 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646 dpsoftrast.texture_end--;
// Regenerates mip levels 1..mipmaps-1 of texture `index` by averaging texels of the
// next-larger level (4 bytes per texel, each byte channel averaged with rounding).
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
650 int i, x, y, z, w, layer0, layer1, row0, row1;
651 unsigned char *o, *i0, *i1, *i2, *i3;
652 DPSOFTRAST_Texture *texture;
653 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// nothing to generate when only the base level exists
654 if (texture->mipmaps <= 1)
656 for (i = 1;i < texture->mipmaps;i++)
658 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's last layer
662 if (layer1 >= texture->mipmap[i-1][4])
663 layer1 = texture->mipmap[i-1][4]-1;
664 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's last row
668 if (row1 >= texture->mipmap[i-1][3])
669 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the source rows in level i-1
// (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
670 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
671 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675 w = texture->mipmap[i][2];
678 if (texture->mipmap[i-1][2] > 1)
680 // average 3D texture
// eight source texels per output texel: (sum + 4) >> 3
681 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
683 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691 // average 3D mipmap with parent width == 1
// four source texels per output texel: (sum + 2) >> 2
// NOTE(review): this loop reads i2/i3 but only advances i0/i1 - confirm it can never run
// more than one iteration (w is expected to be 1 here).
692 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
694 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
703 if (texture->mipmap[i-1][2] > 1)
705 // average 2D texture (common case)
706 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716 // 2D texture with parent width == 1
// two source texels per output texel: (sum + 1) >> 1
717 o[0] = (i0[0] + i1[0] + 1) >> 1;
718 o[1] = (i0[1] + i1[1] + 1) >> 1;
719 o[2] = (i0[2] + i1[2] + 1) >> 1;
720 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a blockwidth x blockheight rectangle of 4-byte texels from `pixels` (tightly
// packed rows) into the texture at (blockx, blocky), then regenerates the mipmaps.
// NOTE(review): the visible lines always address level 0 (mipmap[0]) regardless of `mip`,
// and no bounds check of the block against the texture size is visible.
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
729 DPSOFTRAST_Texture *texture;
731 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// destination of the block's first texel
736 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737 while (blockheight > 0)
// one row per iteration: source advances by the block width, destination by the texture width
739 memcpy(dst, pixels, blockwidth * 4);
740 pixels += blockwidth * 4;
741 dst += texture->mipmap[0][2] * 4;
745 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole base level of texture `index` with `pixels` and regenerates the
// mipmaps. `pixels` must supply at least mipmap[0][1] bytes (the base level's byte size).
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
749 DPSOFTRAST_Texture *texture;
750 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
754 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width of mip level `mip` of texture `index`, or 0 for an invalid index.
// NOTE(review): `mip` is used as an array index with no range check visible here.
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761 return texture->mipmap[mip][2];
// Returns the height of mip level `mip` of texture `index`, or 0 for an invalid index.
// NOTE(review): `mip` is used as an array index with no range check visible here.
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
765 DPSOFTRAST_Texture *texture;
766 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767 return texture->mipmap[mip][3];
// Returns the depth of mip level `mip` of texture `index`, or 0 for an invalid index.
// NOTE(review): `mip` is used as an array index with no range check visible here.
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
771 DPSOFTRAST_Texture *texture;
772 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mip level `mip` inside the texture's pixel
// storage, or a null pointer for an invalid index.
// NOTE(review): `mip` is used as an array index with no range check visible here.
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
777 DPSOFTRAST_Texture *texture;
778 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
781 return texture->bytes + texture->mipmap[mip][0];
// Sets the filter mode of texture `index`. Filter values above
// DPSOFTRAST_TEXTURE_FILTER_LINEAR are flagged with an error string when the texture was
// created without the MIPMAP flag.
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
785 DPSOFTRAST_Texture *texture;
786 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
789 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794 texture->filter = filter;
797 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the commands written so far by copying commandpool.freecommand into
// dpsoftrast.drawcommand. When threads are in use a MEMORY_BARRIER is issued first, so
// the command bytes are stored before the new position becomes visible.
799 static void DPSOFTRAST_Draw_SyncCommands(void)
801 if(dpsoftrast.usethreads) MEMORY_BARRIER;
802 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Tries to make at least `space` bytes available in the command pool by recomputing how
// much of it the threads still have to process, waiting on a lagging thread if needed.
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
807 DPSOFTRAST_State_Thread *thread;
809 int freecommand = dpsoftrast.commandpool.freecommand;
810 int usedcommands = dpsoftrast.commandpool.usedcommands;
// test whether enough room is already free
811 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
813 DPSOFTRAST_Draw_SyncCommands();
// usedcommands becomes the largest distance between the write position and any thread's
// read position (distances are taken modulo the pool size)
819 for (i = 0; i < dpsoftrast.numthreads; i++)
821 thread = &dpsoftrast.threads[i];
822 commandoffset = freecommand - thread->commandoffset;
823 if (commandoffset < 0)
824 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
825 if (commandoffset > usedcommands)
828 usedcommands = commandoffset;
// test whether enough room is free now, or whether there is no thread to wait on
831 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait for the thread selected by waitindex to make progress
833 thread = &dpsoftrast.threads[waitindex];
834 Thread_LockMutex(thread->drawmutex);
835 if (thread->commandoffset != dpsoftrast.drawcommand)
837 thread->waiting = true;
// wake the thread first if it reports being idle
838 if (thread->starving) Thread_CondSignal(thread->drawcond);
839 Thread_CondWait(thread->waitcond, thread->drawmutex);
840 thread->waiting = false;
842 Thread_UnlockMutex(thread->drawmutex);
844 dpsoftrast.commandpool.usedcommands = usedcommands;
// Rounds `size` up to the next multiple of COMMAND_SIZE (COMMAND_SIZE must be a power of
// two for the mask arithmetic to work).
847 #define DPSOFTRAST_ALIGNCOMMAND(size) \
848 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// Allocates a command of type `name` from the pool, with opcode and commandsize filled in.
849 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
850 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves `size` bytes in the command pool and writes the command header (opcode,
// commandsize) at the reserved position. When the tail of the pool is too small for the
// command, a Reset command is written there and allocation restarts from the beginning.
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
854 DPSOFTRAST_Command *command;
855 int freecommand = dpsoftrast.commandpool.freecommand;
856 int usedcommands = dpsoftrast.commandpool.usedcommands;
// `extra` reserves room for a command header and, if a wrap is needed, the skipped tail
857 int extra = sizeof(DPSOFTRAST_Command);
858 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim it (threaded) or flush
860 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
862 if (dpsoftrast.usethreads)
863 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
865 DPSOFTRAST_Draw_FlushThreads();
// the calls above may have changed the pool counters; reload them
866 freecommand = dpsoftrast.commandpool.freecommand;
867 usedcommands = dpsoftrast.commandpool.usedcommands;
// wrap: mark the unused tail with a Reset opcode and count it as used
869 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
871 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872 command->opcode = DPSOFTRAST_OPCODE_Reset;
873 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
876 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877 command->opcode = opcode;
// NOTE(review): commandsize is an unsigned short, so `size` has to fit in 16 bits.
878 command->commandsize = size;
880 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
882 dpsoftrast.commandpool.freecommand = freecommand;
883 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated `size` bytes of the command pool by adjusting
// freecommand and usedcommands.
887 static void DPSOFTRAST_UndoCommand(int size)
889 int freecommand = dpsoftrast.commandpool.freecommand;
890 int usedcommands = dpsoftrast.commandpool.usedcommands;
// adjustment by the pool size (presumably when stepping back passes index 0 - TODO confirm)
893 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894 usedcommands -= size;
895 dpsoftrast.commandpool.freecommand = freecommand;
896 dpsoftrast.commandpool.usedcommands = usedcommands;
// Viewport command (opcode 1): carries the viewport rectangle.
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Thread side: stores the rectangle as viewport = {x, y, width, height} and marks the
// framebuffer-derived state dirty.
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
902 thread->viewport[0] = command->x;
903 thread->viewport[1] = command->y;
904 thread->viewport[2] = command->width;
905 thread->viewport[3] = command->height;
906 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queues a Viewport command, and also updates the global copy of the viewport
// and its derived center/scale right away.
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
910 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
913 command->width = width;
914 command->height = height;
916 dpsoftrast.viewport[0] = x;
917 dpsoftrast.viewport[1] = y;
918 dpsoftrast.viewport[2] = width;
919 dpsoftrast.viewport[3] = height;
920 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command (opcode 2): carries the clear color as four floats.
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side: works on the part of the scissor rectangle that lies in this thread's row
// bands, for each color buffer; the color is converted to a packed 32-bit pixel first.
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
926 int i, x1, y1, x2, y2, w, h, x, y;
927 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
931 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932 miny1 = thread->miny1;
933 maxy1 = thread->maxy1;
934 miny2 = thread->miny2;
935 maxy2 = thread->maxy2;
936 x1 = thread->fb_scissor[0];
937 y1 = thread->fb_scissor[1];
938 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle vertically to the rows this thread owns
940 if (y1 < miny1) y1 = miny1;
941 if (y2 > maxy2) y2 = maxy2;
946 // FIXME: honor fb_colormask?
947 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948 for (i = 0;i < 4;i++)
// test for color targets that are not attached
950 if (!dpsoftrast.fb_colorpixels[i])
// walks the first band (up to maxy1), then continues from miny2 up to maxy2
952 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
955 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956 for (x = x1;x < x2;x++)
// Public API: queues a ClearColor command for the given color.
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
963 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command (opcode 3): carries the float depth to clear to.
970 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side: works on the part of the scissor rectangle that lies in this thread's row
// bands of the depth buffer; the depth is converted to the integer format first.
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
973 int x1, y1, x2, y2, w, h, x, y;
974 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
978 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979 miny1 = thread->miny1;
980 maxy1 = thread->maxy1;
981 miny2 = thread->miny2;
982 maxy2 = thread->maxy2;
983 x1 = thread->fb_scissor[0];
984 y1 = thread->fb_scissor[1];
985 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// clip the rectangle vertically to the rows this thread owns
987 if (y1 < miny1) y1 = miny1;
988 if (y2 > maxy2) y2 = maxy2;
993 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walks the first band (up to maxy1), then continues from miny2 up to maxy2
994 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
997 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998 for (x = x1;x < x2;x++)
// Public API: queues a ClearDepth command for depth `d`.
1002 void DPSOFTRAST_ClearDepth(float d)
1004 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// ColorMask command (opcode 4): one write-enable value per channel.
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Thread side: normalises each enable to 0/1 and builds the packed 32-bit mask.
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1011 thread->colormask[0] = command->r != 0;
1012 thread->colormask[1] = command->g != 0;
1013 thread->colormask[2] = command->b != 0;
1014 thread->colormask[3] = command->a != 0;
// -1 is all ones and -0 is zero, so each enabled channel contributes its byte lane;
// lanes match the BGRA8 packing (r: bits 16-23, g: 8-15, b: 0-7, a: 24-31)
1015 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Public API: queues a ColorMask command with the given per-channel enables.
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1019 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// DepthTest command (opcode 5): enables or disables depth testing.
1026 DEFCOMMAND(5, DepthTest, int enable;)
// Thread side: stores the flag and marks the derived depth function dirty.
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1029 thread->depthtest = command->enable;
1030 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Public API: queues a DepthTest command.
1032 void DPSOFTRAST_DepthTest(int enable)
1034 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035 command->enable = enable;
// ScissorTest command (opcode 6): enables or disables the scissor test.
1038 DEFCOMMAND(6, ScissorTest, int enable;)
// Thread side: stores the flag and marks the framebuffer-derived state dirty.
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1041 thread->scissortest = command->enable;
1042 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queues a ScissorTest command.
1044 void DPSOFTRAST_ScissorTest(int enable)
1046 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047 command->enable = enable;
// Scissor command (opcode 7): carries the scissor rectangle as floats.
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Thread side: stores the rectangle as scissor = {x, y, width, height} and marks the
// framebuffer-derived state dirty.
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1053 thread->scissor[0] = command->x;
1054 thread->scissor[1] = command->y;
1055 thread->scissor[2] = command->width;
1056 thread->scissor[3] = command->height;
1057 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Public API: queues a Scissor command for the given rectangle.
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1061 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1064 command->width = width;
1065 command->height = height;
// BlendFunc command (opcode 8): carries the source and destination blend factors.
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Thread side: stores blendfunc = {sfactor, dfactor} and marks the blend mode dirty.
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1071 thread->blendfunc[0] = command->sfactor;
1072 thread->blendfunc[1] = command->dfactor;
1073 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1077 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078 command->sfactor = sfactor;
1079 command->dfactor = dfactor;
// BlendSubtract command: payload is a single int `enable`.
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
// Applies a BlendSubtract command to one thread's private state.
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1085 thread->blendsubtract = command->enable;
// shares the BLENDFUNC validation bit with the BlendFunc command
1086 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// API entry point: allocates a BlendSubtract command and stores `enable` in it.
1088 void DPSOFTRAST_BlendSubtract(int enable)
1090 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091 command->enable = enable;
// DepthMask command: payload is a single int `enable`.
1094 DEFCOMMAND(10, DepthMask, int enable;)
// Stores the flag in thread->depthmask; no validation bit is raised here.
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1097 thread->depthmask = command->enable;
// API entry point: allocates a DepthMask command and stores `enable` in it.
1099 void DPSOFTRAST_DepthMask(int enable)
1101 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102 command->enable = enable;
// DepthFunc command: payload is the comparison function selector `func`.
1105 DEFCOMMAND(11, DepthFunc, int func;)
// Stores the selector in thread->depthfunc.
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1108 thread->depthfunc = command->func;
// API entry point: allocates a DepthFunc command and stores `func` in it.
1110 void DPSOFTRAST_DepthFunc(int func)
1112 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113 command->func = func;
// DepthRange command: payload is the near and far depth values.
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the range in thread->depthrange[0] (near) and [1] (far).
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1119 thread->depthrange[0] = command->nearval;
1120 thread->depthrange[1] = command->farval;
// API entry point: allocates a DepthRange command and stores both values in it.
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1124 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125 command->nearval = nearval;
1126 command->farval = farval;
// PolygonOffset command: payload is two floats, `alongnormal` and `intoview`.
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Stores the offsets in thread->polygonoffset[0] (alongnormal) and [1] (intoview).
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1132 thread->polygonoffset[0] = command->alongnormal;
1133 thread->polygonoffset[1] = command->intoview;
// API entry point: allocates a PolygonOffset command and stores both offsets in it.
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1137 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138 command->alongnormal = alongnormal;
1139 command->intoview = intoview;
// CullFace command: payload is the culling mode selector `mode`.
1142 DEFCOMMAND(14, CullFace, int mode;)
// Stores the selector in thread->cullface.
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1145 thread->cullface = command->mode;
// API entry point: allocates a CullFace command and stores `mode` in it.
1147 void DPSOFTRAST_CullFace(int mode)
1149 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150 command->mode = mode;
// AlphaTest command: payload is a single int `enable`.
1153 DEFCOMMAND(15, AlphaTest, int enable;)
// Stores the flag in thread->alphatest.
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1156 thread->alphatest = command->enable;
// API entry point: allocates an AlphaTest command and stores `enable` in it.
1158 void DPSOFTRAST_AlphaTest(int enable)
1160 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161 command->enable = enable;
// AlphaFunc command: payload is the comparison selector `func` and the float reference value `ref`.
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Stores the selector in thread->alphafunc and the reference in thread->alphavalue.
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1167 thread->alphafunc = command->func;
1168 thread->alphavalue = command->ref;
// API entry point: allocates an AlphaFunc command and fills it from the arguments.
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1172 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173 command->func = func;
// Sets the constant RGBA color held in dpsoftrast.color[0..3].
// Written directly to the global state; no command record is allocated here.
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1179 dpsoftrast.color[0] = r;
1180 dpsoftrast.color[1] = g;
1181 dpsoftrast.color[2] = b;
1182 dpsoftrast.color[3] = a;
// Reads a rectangle of color buffer 0 into outpixels, 4 bytes per pixel.
// blockx/blocky give the rectangle origin and blockwidth/blockheight its size;
// outpixels is addressed with a row stride of blockwidth * 4 bytes.
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// byte strides of one output row and one framebuffer row
1187 int outstride = blockwidth * 4;
1188 int instride = dpsoftrast.fb_width * 4;
// exclusive right/bottom edges of the requested rectangle
1191 int bx2 = blockx + blockwidth;
1192 int by2 = blocky + blockheight;
1196 unsigned char *inpixels;
// clamp the rectangle to the framebuffer bounds
1200 if (bx1 < 0) bx1 = 0;
1201 if (by1 < 0) by1 = 0;
1202 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1205 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take their own per-pixel loop
1206 if (dpsoftrast.bigendian)
1208 for (y = by1;y < by2;y++)
// framebuffer rows are addressed as (fb_height - 1 - y), i.e. the image is flipped vertically while copying
1210 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
// output rows are numbered from the clamped top edge by1
1211 o = (unsigned char *)outpixels + (y - by1) * outstride;
1212 for (x = bx1;x < bx2;x++)
// other path: same row addressing as above
1225 for (y = by1;y < by2;y++)
1227 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from color buffer 0 into mip level `mip` of texture `index`.
// (tx, ty) is the destination corner in the texture, (sx, sy) the source corner in the framebuffer.
// Returns without copying when the texture handle or mip level is invalid.
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive far edges of the destination and source rectangles
1238 int tx2 = tx + width;
1239 int ty2 = ty + height;
1242 int sx2 = sx + width;
1243 int sy2 = sy + height;
1253 unsigned int *spixels;
1254 unsigned int *tpixels;
1255 DPSOFTRAST_Texture *texture;
1256 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257 if (mip < 0 || mip >= texture->mipmaps) return;
// source: framebuffer color buffer 0 and its dimensions
1259 spixels = dpsoftrast.fb_colorpixels[0];
1260 swidth = dpsoftrast.fb_width;
1261 sheight = dpsoftrast.fb_height;
// destination: mipmap[mip][0] is used as the byte offset of the level, [2] and [3] as its width and height
1262 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263 twidth = texture->mipmap[mip][2];
1264 theight = texture->mipmap[mip][3];
// clamp the destination rectangle to the mip level and the source rectangle to the framebuffer
1265 if (tx1 < 0) tx1 = 0;
1266 if (ty1 < 0) ty1 = 0;
1267 if (tx2 > twidth) tx2 = twidth;
1268 if (ty2 > theight) ty2 = theight;
1269 if (sx1 < 0) sx1 = 0;
1270 if (sy1 < 0) sy1 = 0;
1271 if (sx2 > swidth) sx2 = swidth;
1272 if (sy2 > sheight) sy2 = sheight;
// copy only the area both rectangles can supply
1277 if (tw > sw) tw = sw;
1278 if (th > sh) th = sh;
1279 if (tw < 1 || th < 1)
// convert the source start row to bottom-up addressing; rows are then walked downward (sy1 - y) while the texture rows go upward
1281 sy1 = sheight - 1 - sy1;
1282 for (y = 0;y < th;y++)
// one row of tw 32-bit pixels per memcpy
1283 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// the lower mip levels are rebuilt when the texture has more than one
1284 if (texture->mipmaps > 1)
1285 DPSOFTRAST_Texture_CalculateMipmaps(index);
// SetTexture command: payload is the texture unit number and the texture pointer to bind (may be NULL).
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Rebinds one texture unit in a thread's private state.
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
// release this thread's reference on the texture that was bound before
1291 if (thread->texbound[command->unitnum])
1292 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293 thread->texbound[command->unitnum] = command->texture;
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 DPSOFTRAST_Command_SetTexture *command;
1298 DPSOFTRAST_Texture *texture;
1299 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1304 texture = DPSOFTRAST_Texture_GetByIndex(index);
1305 if (index && !texture)
1307 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1311 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312 command->unitnum = unitnum;
1313 command->texture = texture;
1315 dpsoftrast.texbound[unitnum] = texture;
1316 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the position array (3 floats per vertex) and its byte stride in the global state.
// Only the pointer is stored; the data is not copied here.
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1321 dpsoftrast.pointer_vertex3f = vertex3f;
1322 dpsoftrast.stride_vertex = stride;
// Records a float RGBA color array and its byte stride in the global state.
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1326 dpsoftrast.pointer_color4f = color4f;
// the float and byte color sources are mutually exclusive, so clear the byte one
1327 dpsoftrast.pointer_color4ub = NULL;
1328 dpsoftrast.stride_color = stride;
// Records an unsigned-byte RGBA color array and its byte stride in the global state.
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
// the float and byte color sources are mutually exclusive, so clear the float one
1332 dpsoftrast.pointer_color4f = NULL;
1333 dpsoftrast.pointer_color4ub = color4ub;
1334 dpsoftrast.stride_color = stride;
// Records the texture-coordinate array for slot `unitnum`: pointer, floats per vertex, and byte stride.
// NOTE(review): unitnum indexes the three arrays without a range check — confirm callers guarantee it is in bounds.
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1338 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340 dpsoftrast.stride_texcoord[unitnum] = stride;
// SetShader command: payload is the shader mode, the permutation bits and the exact-specular-math flag.
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Copies the three shader selectors into the thread's private state.
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1346 thread->shader_mode = command->mode;
1347 thread->shader_permutation = command->permutation;
1348 thread->shader_exactspecularmath = command->exactspecularmath;
// API entry point: allocates a SetShader command and fills it from the arguments.
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1352 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353 command->mode = mode;
1354 command->permutation = permutation;
1355 command->exactspecularmath = exactspecularmath;
// the global state keeps its own copy of the selection as well
1357 dpsoftrast.shader_mode = mode;
1358 dpsoftrast.shader_permutation = permutation;
1359 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Uniform4f command: payload is the uniform slot and its four float components.
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Copies the four floats into the thread's uniform4f table; each slot occupies 4 consecutive floats.
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1365 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// API entry point: sets a 4-component uniform from four scalar arguments.
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1369 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370 command->index = index;
1371 command->val[0] = v0;
1372 command->val[1] = v1;
1373 command->val[2] = v2;
1374 command->val[3] = v3;
// mirror the value into the global uniform table
1376 dpsoftrast.uniform4f[index*4+0] = v0;
1377 dpsoftrast.uniform4f[index*4+1] = v1;
1378 dpsoftrast.uniform4f[index*4+2] = v2;
1379 dpsoftrast.uniform4f[index*4+3] = v3;
// API entry point: same as DPSOFTRAST_Uniform4f, reading the four floats from the array v.
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1383 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384 command->index = index;
1385 memcpy(command->val, v, sizeof(command->val));
// mirror the value into the global uniform table
1387 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// UniformMatrix4f command: payload is the first uniform slot and 16 floats, declared with ALIGN for aligned SSE stores.
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Copies the 16 floats into the thread's uniform4f table starting at slot `index` (four consecutive 4-float slots).
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1393 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// API entry point: uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
// v points at 16 floats per matrix; one command is allocated per matrix.
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// each matrix consumes 4 uniform slots and 16 input floats
1399 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1401 __m128 m0, m1, m2, m3;
1402 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403 command->index = (DPSOFTRAST_UNIFORM)index;
// pick unaligned or aligned loads depending on the alignment of v
1404 if (((size_t)v)&(ALIGN_SIZE-1))
1406 m0 = _mm_loadu_ps(v);
1407 m1 = _mm_loadu_ps(v+4);
1408 m2 = _mm_loadu_ps(v+8);
1409 m3 = _mm_loadu_ps(v+12);
1413 m0 = _mm_load_ps(v);
1414 m1 = _mm_load_ps(v+4);
1415 m2 = _mm_load_ps(v+8);
1416 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3: interleave pairs of vectors, then recombine their low/high halves
1420 __m128 t0, t1, t2, t3;
1421 t0 = _mm_unpacklo_ps(m0, m1);
1422 t1 = _mm_unpacklo_ps(m2, m3);
1423 t2 = _mm_unpackhi_ps(m0, m1);
1424 t3 = _mm_unpackhi_ps(m2, m3);
1425 m0 = _mm_movelh_ps(t0, t1);
1426 m1 = _mm_movehl_ps(t1, t0);
1427 m2 = _mm_movelh_ps(t2, t3);
1428 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1430 _mm_store_ps(command->val, m0);
1431 _mm_store_ps(command->val+4, m1);
1432 _mm_store_ps(command->val+8, m2);
1433 _mm_store_ps(command->val+12, m3);
// mirror into the global uniform table; _mm_store_ps requires these addresses to be 16-byte aligned
1434 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Uniform1i command: payload is the uniform slot and one int value.
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Stores the int in the thread's uniform1i table at the given slot.
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1445 thread->uniform1i[command->index] = command->val;
// API entry point: sets an integer uniform.
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1449 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450 command->index = index;
// mirror the value into the global uniform table
1453 dpsoftrast.uniform1i[command->index] = i0;
// ClipPlane command: payload is the four plane coefficients.
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Copies the four coefficients into thread->clipplane.
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1459 memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
// set the FB bit in thread->validate (presumably so framebuffer-derived state is recomputed later)
1460 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// API entry point: allocates a ClipPlane command and stores (x, y, z, w) as coefficients 0..3.
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1464 DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465 command->clipplane[0] = x;
1466 command->clipplane[1] = y;
1467 command->clipplane[2] = z;
1468 command->clipplane[3] = w;
// Copies `size` 4-float elements from a strided byte source into the packed float array dst.
// dst is written with aligned stores, so it must be 16-byte aligned.
1472 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1474 float *end = dst + size*4;
// unaligned loads are needed if either the base pointer or the stride breaks the alignment
1475 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1479 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
// aligned source: use the aligned load
1488 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into 4-float elements (x, y, z, 1) in dst.
// dst is written with aligned stores, so it must be 16-byte aligned.
// The fourth component is produced by rotating the vector so the unused lane lands in lane 0,
// overwriting lane 0 with 1.0 via _mm_move_ss, and rotating back.
1495 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1497 float *end = dst + size*4;
// tightly packed input: four elements are exactly three 16-byte vectors
1498 if (stride == sizeof(float[3]))
// end of the part of dst that can be produced four elements at a time
1500 float *end4 = dst + (size&~3)*4;
1501 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = (x0 y0 z0 x1), v2 = (y1 z1 x2 y2), v3 = (z2 x3 y3 z3)
1505 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0: (x1 x0 y0 z0) -> (1 x0 y0 z0) -> (x0 y0 z0 1)
1506 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1507 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1: (x1 x1 y1 z1) -> (1 x1 y1 z1) -> (x1 y1 z1 1)
1509 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1510 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1511 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2: gather (x2 y2 z2 z2), rotate to (z2 x2 y2 z2), then as above
1512 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1513 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1514 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3: v3 already holds (z2 x3 y3 z3), so only lane 0 needs replacing
1516 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1517 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1519 src += 4*sizeof(float[3]);
// aligned variant of the same four-element unpack
1526 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1527 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1528 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1531 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1532 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1533 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1534 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1535 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1538 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1540 src += 4*sizeof(float[3]);
// one element at a time: choose unaligned or aligned loads from the pointer and stride
1544 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
// NOTE(review): this loads 4 floats from a 3-float element, i.e. 4 bytes beyond it — confirm the source buffer tolerates that for the final element
1548 __m128 v = _mm_loadu_ps((const float *)src);
1549 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1550 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1551 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1552 _mm_store_ps(dst, v);
// aligned variant of the single-element path
1561 __m128 v = _mm_load_ps((const float *)src);
1562 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into 4-float elements (s, t, 0, 1) in dst.
// dst is written with aligned stores, so it must be 16-byte aligned.
1572 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output element
1575 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// tightly packed input: one 16-byte load holds two elements
1576 if (stride == sizeof(float[2]))
// end of the part of dst that can be produced two elements at a time
1578 float *end2 = dst + (size&~1)*4;
1579 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v = (s0 t0 s1 t1)
1583 __m128 v = _mm_loadu_ps((const float *)src);
// low half of v with the upper half of v2 -> (s0 t0 0 1)
1584 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// high half of v moved down, upper half from v2 -> (s1 t1 0 1)
1585 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1587 src += 2*sizeof(float[2]);
// aligned variant of the two-element unpack
1594 __m128 v = _mm_load_ps((const float *)src);
1595 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1596 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1598 src += 2*sizeof(float[2]);
// single element: load the two floats into the low half, keep (0, 1) from v2 in the high half
1604 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided byte source into 4-float elements scaled by 1/255.
// dst is written with aligned stores, so it must be 16-byte aligned.
1610 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1612 float *end = dst + size*4;
// maps byte values 0..255 to 0.0..1.0
1613 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// tightly packed input: one 16-byte load holds four elements
1614 if (stride == sizeof(unsigned char[4]))
// end of the part of dst that can be produced four elements at a time
1616 float *end4 = dst + (size&~3)*4;
1617 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// zero-extend the 16 bytes to 16-bit lanes: v1 = elements 0-1, v2 = elements 2-3
1621 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
// zero-extend each element to 32-bit lanes, convert to float and scale
1622 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1623 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1624 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1625 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627 src += 4*sizeof(unsigned char[4]);
// aligned variant of the four-element conversion
1634 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640 src += 4*sizeof(unsigned char[4]);
// single element: read its 4 bytes as one int, then widen, convert and scale as above
// NOTE(review): the int read assumes src is suitably aligned for int access — confirm for odd strides
1646 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1647 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Writes the single 4-float value at src into the 4-float elements of dst; `end` marks dst + 4*size.
// src may be unaligned (unaligned load); dst is written with aligned stores, so it must be 16-byte aligned.
1653 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1655 float *end = dst + 4*size;
// the source value is loaded once up front
1656 __m128 v = _mm_loadu_ps(src);
1659 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors from in4f by the 16-float matrix inmatrix16f.
// Each result is x*m0 + y*m1 + z*m2 + w*m3, where m0..m3 are the four consecutive
// 4-float groups of the matrix. in4f may equal out4f.
1665 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1668 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1669 __m128 m0, m1, m2, m3;
// bytewise comparison against the identity matrix
1671 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1673 // fast case for identity matrix
// nothing to do when transforming in place; otherwise a plain copy suffices
1674 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1677 end = out4f + numitems*4;
// the matrix is loaded with unaligned loads, so it carries no alignment requirement
1678 m0 = _mm_loadu_ps(inmatrix16f);
1679 m1 = _mm_loadu_ps(inmatrix16f + 4);
1680 m2 = _mm_loadu_ps(inmatrix16f + 8);
1681 m3 = _mm_loadu_ps(inmatrix16f + 12);
// choose unaligned or aligned input loads based on the alignment of in4f
1682 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1686 __m128 v = _mm_loadu_ps(in4f);
1688 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1689 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1690 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1691 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned variant of the same multiply-accumulate
1700 __m128 v = _mm_load_ps(in4f);
1702 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1703 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1704 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1705 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vertices from in4f to out4f.
// Does nothing when numitems is not positive or when both pointers name the
// same buffer: memcpy is undefined for overlapping regions, and a negative
// count would convert to an enormous size_t.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale):
// perspective-divides the clip-space vector `in` and applies the viewport transform.
// The vector is rotated to (1, x, y, z) order, so viewportcenter/viewportscale must be in that lane order;
// after rotating back, `out` is (x', y', z', center0 + scale0/w), i.e. lane 3 carries a function of 1/w.
1719 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1721 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1722 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1723 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1724 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale):
// the statements below are the same as in DPSOFTRAST_PROJECTVERTEX: rotate to (1, x, y, z),
// compute center + scale * p / w for all lanes, rotate back into `out`.
1727 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1729 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1730 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1731 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1732 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3):
// out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, each component of p broadcast across all four lanes.
1735 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1738 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1739 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1740 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1741 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Estimates the screen-space Y range covered by an axis-aligned bounding box after transformation.
// minposf/maxposf are the box corners (loaded with aligned loads, so 16-byte aligned);
// inmatrix16f is the 16-float transform; the range is written to *starty and *endy.
// The 8 box corners are transformed; corners with z + w >= 0 contribute their y/w directly (BBFRONT),
// and for each corner that fails the test, the three box edges leading to a passing corner
// (k^1, k^2, k^4) are interpolated to the z + w == 0 crossing and projected (BBCLIP).
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// one bit per box corner, all initially set; BBFRONT clears the bit of every corner that passes the test
1746 int clipmask = 0xFF;
1747 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// minproj/maxproj start inverted (2 and -2) so the first projected value replaces them
1748 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix vector so the transformed Y ends up in lane 0, where the scalar (_ss) operations work
1751 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if it is >= 0 fold y/w into minproj/maxproj
1755 #define BBFRONT(k, pos) \
1757 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1762 clipmask &= ~(1<<k); \
1763 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764 minproj = _mm_min_ss(minproj, proj); \
1765 maxproj = _mm_max_ss(maxproj, proj); \
1769 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1770 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1771 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1772 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1773 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1774 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1778 if (clipmask&(1<<k)) \
1780 if (!(clipmask&(1<<(k^1)))) \
1782 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785 minproj = _mm_min_ss(minproj, proj); \
1786 maxproj = _mm_max_ss(maxproj, proj); \
1788 if (!(clipmask&(1<<(k^2)))) \
1790 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793 minproj = _mm_min_ss(minproj, proj); \
1794 maxproj = _mm_max_ss(maxproj, proj); \
1796 if (!(clipmask&(1<<(k^4)))) \
1798 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801 minproj = _mm_min_ss(minproj, proj); \
1802 maxproj = _mm_max_ss(maxproj, proj); \
1806 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move lane 2 of the viewport parameters (the Y entry in their (1, x, y, z) ordering) into lane 0
1807 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping it to screen coordinates
1809 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (presumably the Y viewport scale is negative — confirm); both are truncated toward zero
1813 *starty = _mm_cvttss_si32(maxproj);
1814 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` already-transformed 4-float positions to screen space.
// Each input is copied to out4f, its projected form is written to screen4f, and the componentwise
// min/max of the inputs is tracked. The resulting bounding box is passed to DPSOFTRAST_Vertex_BoundY
// with an identity matrix to obtain *starty / *endy; that call's result is returned.
// out4f and screen4f are written with aligned stores, so both must be 16-byte aligned.
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821 float *end = out4f + numitems*4;
1822 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823 __m128 minpos, maxpos;
// choose unaligned or aligned input loads based on the alignment of in4f
1824 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first vertex
// NOTE(review): this reads in4f[0..3] before any count check visible here — confirm numitems >= 1 at call sites
1826 minpos = maxpos = _mm_loadu_ps(in4f);
1829 __m128 v = _mm_loadu_ps(in4f);
1830 minpos = _mm_min_ps(minpos, v);
1831 maxpos = _mm_max_ps(maxpos, v);
1832 _mm_store_ps(out4f, v);
1833 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1842 minpos = maxpos = _mm_load_ps(in4f);
1845 __m128 v = _mm_load_ps(in4f);
1846 minpos = _mm_min_ps(minpos, v);
1847 maxpos = _mm_max_ps(maxpos, v);
1848 _mm_store_ps(out4f, v);
1849 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned float arrays for DPSOFTRAST_Vertex_BoundY
1858 ALIGN(float minposf[4]);
1859 ALIGN(float maxposf[4]);
1860 _mm_store_ps(minposf, minpos);
1861 _mm_store_ps(maxposf, maxpos);
// the positions are already transformed, so the box is bounded with the identity matrix
1862 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` 4-float positions by inmatrix16f and projects them to screen space.
// The transformed positions go to out4f and the projected ones to screen4f. The componentwise
// min/max of the untransformed inputs is tracked and handed, together with the matrix, to
// DPSOFTRAST_Vertex_BoundY to obtain *starty / *endy; that call's result is returned.
// out4f and screen4f are written with aligned stores, so both must be 16-byte aligned.
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
// an identity matrix (bytewise comparison) needs no transform, so defer to the projection-only routine
1872 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874 end = out4f + numitems*4;
1875 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// the matrix is loaded with unaligned loads, so it carries no alignment requirement
1877 m0 = _mm_loadu_ps(inmatrix16f);
1878 m1 = _mm_loadu_ps(inmatrix16f + 4);
1879 m2 = _mm_loadu_ps(inmatrix16f + 8);
1880 m3 = _mm_loadu_ps(inmatrix16f + 12);
// choose unaligned or aligned input loads based on the alignment of in4f
1881 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// seed the bounding box with the first (untransformed) vertex
1883 minpos = maxpos = _mm_loadu_ps(in4f);
1886 __m128 v = _mm_loadu_ps(in4f);
// the bounds are accumulated before the transform is applied
1887 minpos = _mm_min_ps(minpos, v);
1888 maxpos = _mm_max_ps(maxpos, v);
1889 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890 _mm_store_ps(out4f, v);
1891 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892 _mm_store_ps(screen4f, v);
// aligned variant of the same loop
1900 minpos = maxpos = _mm_load_ps(in4f);
1903 __m128 v = _mm_load_ps(in4f);
1904 minpos = _mm_min_ps(minpos, v);
1905 maxpos = _mm_max_ps(maxpos, v);
1906 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907 _mm_store_ps(out4f, v);
1908 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909 _mm_store_ps(screen4f, v);
// spill the bounding box to aligned float arrays for DPSOFTRAST_Vertex_BoundY
1917 ALIGN(float minposf[4]);
1918 ALIGN(float maxposf[4]);
1919 _mm_store_ps(minposf, minpos);
1920 _mm_store_ps(maxposf, maxpos);
// the box is in untransformed space, so the bound routine is given the matrix
1921 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Expands one client vertex array into the packed 4-float working array dpsoftrast.post_array4f[outarray].
// inarray selects the source (position, color, or a texcoord slot); the vertex range is
// dpsoftrast.firstvertex / dpsoftrast.numvertices.
1927 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1930 float *outf = dpsoftrast.post_array4f[outarray];
1931 const unsigned char *inb;
1932 int firstvertex = dpsoftrast.firstvertex;
1933 int numvertices = dpsoftrast.numvertices;
// positions: 3 floats per vertex, widened to 4 by the loader
1937 case DPSOFTRAST_ARRAY_POSITION:
1938 stride = dpsoftrast.stride_vertex;
1939 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1940 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// colors: float array if set, else byte array if set, else the constant color for every vertex
1942 case DPSOFTRAST_ARRAY_COLOR:
1943 stride = dpsoftrast.stride_color;
1944 if (dpsoftrast.pointer_color4f)
1946 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1947 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1949 else if (dpsoftrast.pointer_color4ub)
1951 stride = dpsoftrast.stride_color;
1952 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1953 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1957 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// texcoords: the slot is inarray - DPSOFTRAST_ARRAY_TEXCOORD0; loading only happens when a pointer is set
1961 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1962 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1964 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
// pick the loader that matches the number of components per texcoord
1965 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1968 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1971 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1974 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a working vertex array in place by inmatrix16f.
// When inarray >= 0 the array is first loaded from the client arrays into slot outarray;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray] are used.
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// source and destination are the same buffer
1989 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a working vertex array to screen space without applying a matrix.
// When inarray >= 0 the array is first loaded from the client arrays into slot outarray;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray] are used.
1994 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1997 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// screen coordinates go to dpsoftrast.screencoord4f, the Y range to drawstarty/drawendy, and the return value to drawclipped
1998 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a working vertex array in place by inmatrix16f and projects it to screen space.
// When inarray >= 0 the array is first loaded from the client arrays into slot outarray;
// otherwise the existing contents of dpsoftrast.post_array4f[outarray] are used.
2006 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2009 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
// screen coordinates go to dpsoftrast.screencoord4f, the Y range to drawstarty/drawendy, and the return value to drawclipped
2010 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Sets up the per-pixel reciprocal of the interpolated w for one span of a triangle.
// w is linear in screen space (triangle->w[0] = x slope, [1] = y slope, [2] = constant term);
// its reciprocal is evaluated exactly at subspan boundaries and interpolated linearly between them.
// zf is the caller-supplied float buffer (presumably receives the per-pixel values — confirm).
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2020 int startx = span->startx;
2021 int endx = span->endx;
2022 float wslope = triangle->w[0];
// w at the span origin (span->x, span->y); startx/endx are offsets from that origin
2023 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel of the span
2024 float endz = 1.0f / (w + wslope * startx);
// zero x slope means the value is constant along the span
2025 if (triangle->w[0] == 0)
2027 // LordHavoc: fast flat polygons (HUD/menu)
2028 for (x = startx;x < endx;x++)
// general case: walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2032 for (x = startx;x < endx;)
2034 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last subspan is clipped to the final pixel of the span
2036 if (nextsub >= endx) nextsub = endsub = endx-1;
// exact reciprocal at the far end of the subspan
2037 endz = 1.0f / (w + wslope * nextsub);
// linear step between the two exact values; 0 when the subspan is a single pixel
2038 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039 for (; x <= endsub; x++, z += dz)
// Writes a shaded span of 4-byte pixels (in4ub, indexed by x) into the color
// framebuffer dpsoftrast.fb_colorpixels[0], honoring span->pixelmask, the
// thread's alpha test and the thread's framebuffer blend mode.
// Byte 3 of each input pixel is treated as alpha.
// NOTE(review): this listing omits braces and some short statements, so the
// comments below describe only the statements that are visible.
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2048 int startx = span->startx;
2049 int endx = span->endx;
// 32-bit view of the input so a whole pixel moves with one load/store
2052 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2053 unsigned char * RESTRICT pixelmask = span->pixelmask;
// byte and 32-bit views of the same framebuffer
2054 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2055 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both views to the span origin (row span->y, column span->x); x then indexes from there
2058 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2059 pixeli += span->y * dpsoftrast.fb_width + span->x;
2060 // handle alphatest now (this affects depth writes too)
2061 if (thread->alphatest)
2062 for (x = startx;x < endx;x++)
// alpha below 128 fails the test and the pixel is masked off
2063 if (in4ub[x*4+3] < 128)
2064 pixelmask[x] = false;
2065 // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2066 // helps sprites, text and hud artwork
2067 switch(thread->fb_blendmode)
// modes that scale the source by its alpha: a source alpha of 0 contributes nothing
2069 case DPSOFTRAST_BLENDMODE_ALPHA:
2070 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2071 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2073 for (x = startx;x < endx;x++)
// in4ub is unsigned char, so ">= 1" means non-zero alpha and "< 1" means alpha == 0
2075 if (in4ub[x*4+3] >= 1)
// skip over a run of pixels with non-zero alpha
2080 while (++x < endx && in4ub[x*4+3] >= 1) ;
2082 if (x >= endx) break;
// mask off the following run of zero-alpha pixels
2084 while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2085 if (x >= endx) break;
// the remaining blend modes are listed explicitly; no mask trimming is visible for them
2092 case DPSOFTRAST_BLENDMODE_OPAQUE:
2093 case DPSOFTRAST_BLENDMODE_ADD:
2094 case DPSOFTRAST_BLENDMODE_INVMOD:
2095 case DPSOFTRAST_BLENDMODE_MUL:
2096 case DPSOFTRAST_BLENDMODE_MUL2:
2097 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098 case DPSOFTRAST_BLENDMODE_INVADD:
2101 // put some special values at the end of the mask to ensure the loops end
// NOTE(review): these two stores land at and past index endx - assumes the
// mask buffer has at least two bytes of slack beyond the span; confirm.
2102 pixelmask[endx] = 1;
2103 pixelmask[endx+1] = 0;
2104 // LordHavoc: use a double loop to identify subspans, this helps the
2105 // optimized copy/blend loops to perform at their best, most triangles
2106 // have only one run of pixels, and do the search using wide reads...
2110 // if this pixel is masked off, it's probably not alone...
2117 // the 4-item search must be aligned or else it stalls badly
2118 if ((x & 3) && !pixelmask[x])
2120 if(pixelmask[x]) goto endmasked;
2124 if(pixelmask[x]) goto endmasked;
2128 if(pixelmask[x]) goto endmasked;
// test four mask bytes per read: all zero means four masked-off pixels in a row
2133 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
// finish the search one byte at a time
2137 for (;!pixelmask[x];x++)
2139 // rather than continue the loop, just check the end variable
2144 // find length of subspan
2147 if (subx + 8 < endx)
2151 if(!pixelmask[subx]) goto endunmasked;
2155 if(!pixelmask[subx]) goto endunmasked;
2159 if(!pixelmask[subx]) goto endunmasked;
// 0x01010101 matches four consecutive mask bytes that are each exactly 1
2164 while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2168 for (;pixelmask[subx];subx++)
2170 // the checks can overshoot, so make sure to clip it...
2174 // now that we know the subspan length... process!
2175 switch(thread->fb_blendmode)
// opaque: plain copy of pixels [x, subx) from the input to the framebuffer
2177 case DPSOFTRAST_BLENDMODE_OPAQUE:
2181 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
// unaligned 128-bit copies, 16 pixels per iteration
2186 while (x + 16 <= subx)
2188 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2189 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2190 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2191 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
// then 4 pixels per iteration
2196 while (x + 4 <= subx)
2198 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2204 pixeli[x+1] = ini[x+1];
// Blended modes share the FINISHBLEND macro defined just below: it loads source
// and destination pixels, widens each byte to a 16-bit lane, and packs the
// result back to bytes with unsigned saturation (_mm_packus_epi16).  The loop
// handles two pixels per iteration (64-bit loads); a single-pixel form follows.
// NOTE(review): the places where blend2/blend1 are expanded are not visible in
// this listing; presumably blend2 is the two-pixel body and blend1 the
// single-pixel body - confirm.
// In the alpha modes, "blend" broadcasts 16-bit lane 3 (alpha) of each pixel.
// ALPHA: dst += (src - dst) * alpha / 256.  Shifting both mulhi operands left
// by 4 makes the high 16 bits of the product equal (a*b) >> 8.
2214 case DPSOFTRAST_BLENDMODE_ALPHA:
2215 #define FINISHBLEND(blend2, blend1) \
2216 for (;x + 1 < subx;x += 2) \
2219 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2220 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2222 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2228 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2230 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2234 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2237 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * alpha) >> 8
2241 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2243 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2244 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2246 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2247 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst (the pack step saturates at 255)
2250 case DPSOFTRAST_BLENDMODE_ADD:
2251 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2253 case DPSOFTRAST_BLENDMODE_INVMOD:
2255 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2260 case DPSOFTRAST_BLENDMODE_MUL:
2261 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2263 case DPSOFTRAST_BLENDMODE_MUL2:
2264 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * alpha) >> 8
2266 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2268 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * alpha) >> 8)
2275 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2277 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2278 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2280 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2281 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2284 case DPSOFTRAST_BLENDMODE_INVADD:
2286 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex along one span and writes
// float colors to out4f (4 floats per pixel, indexed by x).  Texel bytes 2,1,0,3
// are stored to output channels 0,1,2,3, each scaled to 0..1.
// Texture coordinates come from the attribute selected by arrayindex, are
// multiplied by the per-pixel term zf[x] at subspan endpoints, and are stepped
// in between using 12-bit fractional fixed point.
// NOTE(review): this listing omits braces, several declarations and some
// branch headers; comments describe only what is visible.
2296 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2299 int startx = span->startx;
2300 int endx = span->endx;
2305 float tc[2], endtc[2];
// integer texel coordinates: tci is the sampled texel, tci1 its +1 neighbour
2307 unsigned int tci[2];
2308 unsigned int tci1[2];
2309 unsigned int tcimin[2];
2310 unsigned int tcimax[2];
2315 const unsigned char * RESTRICT pixelbase;
// the four texels used for bilinear filtering
2316 const unsigned char * RESTRICT pixel[4];
2317 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2318 // if no texture is bound, just fill it with white
2321 for (x = startx;x < endx;x++)
2323 out4f[x*4+0] = 1.0f;
2324 out4f[x*4+1] = 1.0f;
2325 out4f[x*4+2] = 1.0f;
2326 out4f[x*4+3] = 1.0f;
2330 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as a byte offset of this mip level inside texture->bytes
2331 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2332 // if this mipmap of the texture is 1 pixel, just fill it with that color
// presumably mipmap[mip][1] is the level's size in bytes (4 bytes = one pixel) - confirm
2333 if (texture->mipmap[mip][1] == 4)
// NOTE(review): the constant color is read from texture->bytes[0..3], not from
// pixelbase - confirm that the one-pixel mip level is stored at offset 0.
2335 c[0] = texture->bytes[2] * (1.0f/255.0f);
2336 c[1] = texture->bytes[1] * (1.0f/255.0f);
2337 c[2] = texture->bytes[0] * (1.0f/255.0f);
2338 c[3] = texture->bytes[3] * (1.0f/255.0f);
2339 for (x = startx;x < endx;x++)
2341 out4f[x*4+0] = c[0];
2342 out4f[x*4+1] = c[1];
2343 out4f[x*4+2] = c[2];
2344 out4f[x*4+3] = c[3];
2348 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// macro defined elsewhere; it is handed data and slope for the attribute arrayindex
2349 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2350 flags = texture->flags;
// mipmap[mip][2] scales coordinate 0 and is also the row stride in texels (tciwidth);
// mipmap[mip][3] scales coordinate 1
2351 tcscale[0] = texture->mipmap[mip][2];
2352 tcscale[1] = texture->mipmap[mip][3];
2353 tciwidth = texture->mipmap[mip][2];
// clamp upper bounds (tcimin is assigned on lines not visible in this listing)
2356 tcimax[0] = texture->mipmap[mip][2]-1;
2357 tcimax[1] = texture->mipmap[mip][3]-1;
// wrap masks of size-1: assumes power-of-two mip dimensions - TODO confirm
2358 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2359 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2360 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2361 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
// walk the span in subspans of at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2367 for (x = startx;x < endx;)
2369 unsigned int subtc[2];
2370 unsigned int substep[2];
// converts a coordinate delta over a full subspan into a per-pixel step with 12 fractional bits
2371 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2372 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2373 if (nextsub >= endx)
// last subspan: end on pixel endx-1 and rescale the step to the shorter length
2375 nextsub = endsub = endx-1;
2376 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
// coordinate at the subspan end, using the perspective term at that pixel
2380 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2381 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
// float to unsigned fixed point: step per pixel and start value, both with 12 fractional bits
2387 substep[0] = (endtc[0] - tc[0]) * subscale;
2388 substep[1] = (endtc[1] - tc[1]) * subscale;
2389 subtc[0] = tc[0] * (1<<12);
2390 subtc[1] = tc[1] * (1<<12);
// four-texel (bilinear) sampling with coordinates clamped to [tcimin, tcimax]
2393 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2395 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// 12-bit fractions and their complements; each lerp weight is a product of two
// of them, so the four weights sum to 0x1000000
2397 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2398 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2399 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2400 tci[0] = subtc[0]>>12;
2401 tci[1] = subtc[1]>>12;
2402 tci1[0] = tci[0] + 1;
2403 tci1[1] = tci[1] + 1;
2404 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2407 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// byte addresses of the 2x2 texel footprint (4 bytes per texel, tciwidth texels per row)
2408 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2409 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2410 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2411 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// weighted sum of bytes; 0xFF000000 = 255 * 0x1000000 normalizes the result to 0..1
2412 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2413 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2414 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2415 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2416 out4f[x*4+0] = c[0];
2417 out4f[x*4+1] = c[1];
2418 out4f[x*4+2] = c[2];
2419 out4f[x*4+3] = c[3];
// four-texel (bilinear) sampling with coordinates wrapped by tciwrapmask
2424 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2426 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2427 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2428 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2429 tci[0] = subtc[0]>>12;
2430 tci[1] = subtc[1]>>12;
2431 tci1[0] = tci[0] + 1;
2432 tci1[1] = tci[1] + 1;
2433 tci[0] &= tciwrapmask[0];
2434 tci[1] &= tciwrapmask[1];
2435 tci1[0] &= tciwrapmask[0];
2436 tci1[1] &= tciwrapmask[1];
2437 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2438 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2439 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2440 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2441 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2442 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2443 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2444 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2445 out4f[x*4+0] = c[0];
2446 out4f[x*4+1] = c[1];
2447 out4f[x*4+2] = c[2];
2448 out4f[x*4+3] = c[3];
// single-texel sampling with coordinates clamped to [tcimin, tcimax]
2452 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2454 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2456 tci[0] = subtc[0]>>12;
2457 tci[1] = subtc[1]>>12;
2458 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2459 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2460 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461 c[0] = pixel[0][2] * (1.0f / 255.0f);
2462 c[1] = pixel[0][1] * (1.0f / 255.0f);
2463 c[2] = pixel[0][0] * (1.0f / 255.0f);
2464 c[3] = pixel[0][3] * (1.0f / 255.0f);
2465 out4f[x*4+0] = c[0];
2466 out4f[x*4+1] = c[1];
2467 out4f[x*4+2] = c[2];
2468 out4f[x*4+3] = c[3];
// single-texel sampling with coordinates wrapped by tciwrapmask
2473 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2475 tci[0] = subtc[0]>>12;
2476 tci[1] = subtc[1]>>12;
2477 tci[0] &= tciwrapmask[0];
2478 tci[1] &= tciwrapmask[1];
2479 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2480 c[0] = pixel[0][2] * (1.0f / 255.0f);
2481 c[1] = pixel[0][1] * (1.0f / 255.0f);
2482 c[2] = pixel[0][0] * (1.0f / 255.0f);
2483 c[3] = pixel[0][3] * (1.0f / 255.0f);
2484 out4f[x*4+0] = c[0];
2485 out4f[x*4+1] = c[1];
2486 out4f[x*4+2] = c[2];
2487 out4f[x*4+3] = c[3];
// SSE2 version of 2D texture sampling for one span: writes one 32-bit texel
// value per pixel to out4ub (viewed as unsigned int through outi), without
// reordering bytes.
// Texture coordinates come from the attribute selected by arrayindex, are
// multiplied by the per-pixel term zf[] at subspan endpoints and by the mip
// level size, and are stepped in 16.16 fixed point (scale 65536.0f) in between.
// NOTE(review): this listing omits braces, several declarations and some
// branch headers; comments describe only what is visible.
2493 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2497 int startx = span->startx;
2498 int endx = span->endx;
2500 __m128 data, slope, tcscale;
2501 __m128i tcsize, tcmask, tcoffset, tcmax;
2503 __m128i subtc, substep, endsubtc;
2506 int affine; // LordHavoc: optimized affine texturing case
2507 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2508 const unsigned char * RESTRICT pixelbase;
2509 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2510 // if no texture is bound, just fill it with white
// every byte of pixels [startx, endx) is set to 255
2513 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2516 mip = triangle->mip[texunitindex];
// mipmap[mip][0] is used as a byte offset of this mip level inside texture->bytes
2517 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2518 // if this mipmap of the texture is 1 pixel, just fill it with that color
2519 if (texture->mipmap[mip][1] == 4)
// k is the single texel of this level, read as one 32-bit value
2521 unsigned int k = *((const unsigned int *)pixelbase);
2522 for (x = startx;x < endx;x++)
// equal perspective terms at both ends of the span: the whole span is then
// handled as a single subspan (see the "|| affine" test below)
2526 affine = zf[startx] == zf[endx-1];
2527 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
// macro defined elsewhere; it is handed data and slope for the attribute arrayindex
2528 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2529 flags = texture->flags;
// tcsize lanes = (mipmap[mip][2], mipmap[mip][3], mipmap[mip][2], mipmap[mip][3]);
// element [2] acts as the width (it becomes the row stride below)
2530 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2531 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2532 tcscale = _mm_cvtepi32_ps(tcsize);
// duplicate the two coordinates into both halves and scale them to texel units
2533 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2534 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
// texel-space coordinate at the first pixel of the span
2535 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
// NOTE(review): the condition guarding this half-texel shift is not visible in
// this listing; presumably it applies only when linear filtering is on - confirm.
2537 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2538 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane holds 4 in its low 16 bits and width*4 in its high 16 bits,
// so _mm_madd_epi16 on an (x, y) pair of 16-bit texel coordinates yields the
// byte offset x*4 + y*width*4
2539 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size-1 packed to 16-bit lanes; used both as the clamp maximum and as the wrap mask
2540 tcmax = _mm_packs_epi32(tcmask, tcmask);
2541 for (x = startx;x < endx;)
2543 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2544 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2545 if (nextsub >= endx || affine)
// last (or only) subspan: end on pixel endx-1 and rescale the step to its length
2547 nextsub = endsub = endx-1;
2548 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// coordinate at the subspan end, using the perspective term at that pixel
2552 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2554 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2555 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2556 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// low 64 bits: coordinates of pixel x; high 64 bits: coordinates of pixel x+1.
// The step is doubled because the main loops advance two pixels per iteration.
2557 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2558 substep = _mm_slli_epi32(substep, 1);
// integer texel coordinates at the subspan start and one step past its end
2561 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
// fast path when every lane satisfies 0 <= coordinate < size-1: no clamping or
// wrapping is needed and the +1 neighbours are always inside the texture
2562 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// row stride in bytes (the high 16 bits of a tcoffset lane)
2564 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2565 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2567 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep only the integer (high 16-bit) part of each 16.16 coordinate
2568 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2569 tci = _mm_madd_epi16(tci, tcoffset);
2570 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2571 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; the second load is one row down
2572 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2573 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2574 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2575 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// halve the 16-bit fractions so they stay non-negative as signed words; paired
// with the <<1 on the difference, _mm_mulhi_epi16 then yields (diff * frac) >> 16
2576 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows with the second coordinate's fraction, first pixel then second pixel
2577 pix1 = _mm_add_epi16(pix1,
2578 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2579 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2580 pix3 = _mm_add_epi16(pix3,
2581 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2582 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// regroup into left texels (pix2) and right texels (pix4), then blend with the
// first coordinate's fraction
2583 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2584 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2585 pix2 = _mm_add_epi16(pix2,
2586 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2587 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2588 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the fast path
2592 const unsigned char * RESTRICT ptr1;
2593 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2594 tci = _mm_madd_epi16(tci, tcoffset);
2595 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2596 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2597 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2598 fracm = _mm_srli_epi16(subtc, 1);
2599 pix1 = _mm_add_epi16(pix1,
2600 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2601 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2602 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2603 pix1 = _mm_add_epi16(pix1,
2604 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2605 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2606 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with clamping: the four (x,y) texel pairs are
// (x,y), (x+1,y), (x,y+1), (x+1,y+1), each clamped to [0, tcmax] and fetched individually
2610 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2612 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2614 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2615 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2616 tci = _mm_madd_epi16(tci, tcoffset);
2617 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2618 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2619 _mm_setzero_si128());
2620 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2621 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2622 _mm_setzero_si128());
2623 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// NOTE(review): the second pixel of the pair is wrapped with _mm_and_si128 here,
// while the first pixel above and the single-pixel tail below clamp with
// min/max - this looks like a copy/paste slip in the CLAMPTOEDGE path; confirm.
2624 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2625 tci = _mm_madd_epi16(tci, tcoffset);
2626 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628 _mm_setzero_si128());
2629 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631 _mm_setzero_si128());
2632 fracm = _mm_srli_epi16(subtc, 1);
2633 pix1 = _mm_add_epi16(pix1,
2634 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636 pix3 = _mm_add_epi16(pix3,
2637 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2638 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2639 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2640 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2641 pix2 = _mm_add_epi16(pix2,
2642 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2643 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2644 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the clamped bilinear path
2648 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2649 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2650 tci = _mm_madd_epi16(tci, tcoffset);
2651 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2652 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2653 _mm_setzero_si128());
2654 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2655 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2656 _mm_setzero_si128());
2657 fracm = _mm_srli_epi16(subtc, 1);
2658 pix1 = _mm_add_epi16(pix1,
2659 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2661 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2662 pix1 = _mm_add_epi16(pix1,
2663 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2664 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2665 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with wrapping: coordinates are masked with tcmax (size-1),
// which assumes power-of-two mip dimensions - TODO confirm
2671 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2673 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2674 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2675 tci = _mm_madd_epi16(tci, tcoffset);
2676 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2677 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2678 _mm_setzero_si128());
2679 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2680 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2681 _mm_setzero_si128());
2682 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2683 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684 tci = _mm_madd_epi16(tci, tcoffset);
2685 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687 _mm_setzero_si128());
2688 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690 _mm_setzero_si128());
2691 fracm = _mm_srli_epi16(subtc, 1);
2692 pix1 = _mm_add_epi16(pix1,
2693 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695 pix3 = _mm_add_epi16(pix3,
2696 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2697 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2698 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2699 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2700 pix2 = _mm_add_epi16(pix2,
2701 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2702 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2703 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel form of the wrapped bilinear path
2707 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2708 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2709 tci = _mm_madd_epi16(tci, tcoffset);
2710 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2711 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2712 _mm_setzero_si128());
2713 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2714 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2715 _mm_setzero_si128());
2716 fracm = _mm_srli_epi16(subtc, 1);
2717 pix1 = _mm_add_epi16(pix1,
2718 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2719 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2720 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2721 pix1 = _mm_add_epi16(pix1,
2722 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2723 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2724 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling: one 32-bit texel is copied per pixel, clamped variant first
2731 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2733 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2735 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2736 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2737 tci = _mm_madd_epi16(tci, tcoffset);
2738 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2739 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2743 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2744 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2745 tci = _mm_madd_epi16(tci, tcoffset);
2746 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling with wrapping (coordinates masked with tcmax)
2752 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2754 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2755 tci = _mm_and_si128(tci, tcmax);
2756 tci = _mm_madd_epi16(tci, tcoffset);
2757 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2758 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2762 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2763 tci = _mm_and_si128(tci, tcmax);
2764 tci = _mm_madd_epi16(tci, tcoffset);
2765 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2774 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2777 memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
// Shadow map lookup taking a float vector and returning a float.
// NOTE(review): the body of this function is not visible in this listing, so
// nothing about the returned value can be stated from here.
2780 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies a float RGBA span buffer by an interpolated per-vertex attribute:
// out4f[x] = in4f[x] * ((data + slope*x) * z), per channel, for x in [startx, endx).
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing; presumably z is the per-pixel term zf[x] - confirm.
2786 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2789 int startx = span->startx;
2790 int endx = span->endx;
// macro defined elsewhere; it is handed data and slope for the attribute arrayindex
2795 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2796 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2799 c[0] = (data[0] + slope[0]*x) * z;
2800 c[1] = (data[1] + slope[1]*x) * z;
2801 c[2] = (data[2] + slope[2]*x) * z;
2802 c[3] = (data[3] + slope[3]*x) * z;
2803 out4f[x*4+0] = in4f[x*4+0] * c[0];
2804 out4f[x*4+1] = in4f[x*4+1] * c[1];
2805 out4f[x*4+2] = in4f[x*4+2] * c[2];
2806 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated per-vertex attribute to a float RGBA span buffer:
// out4f[x] = (data + slope*x) * z, per channel, for x in [startx, endx).
// NOTE(review): the declarations and the assignment of z are not visible in
// this listing; presumably z is the per-pixel term zf[x] - confirm.
2810 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2813 int startx = span->startx;
2814 int endx = span->endx;
// macro defined elsewhere; it is handed data and slope for the attribute arrayindex
2819 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2820 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2823 c[0] = (data[0] + slope[0]*x) * z;
2824 c[1] = (data[1] + slope[1]*x) * z;
2825 c[2] = (data[2] + slope[2]*x) * z;
2826 c[3] = (data[3] + slope[3]*x) * z;
2827 out4f[x*4+0] = c[0];
2828 out4f[x*4+1] = c[1];
2829 out4f[x*4+2] = c[2];
2830 out4f[x*4+3] = c[3];
2834 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2836 int x, startx = span->startx, endx = span->endx;
2837 float c[4], localcolor[4];
2838 localcolor[0] = subcolor[0];
2839 localcolor[1] = subcolor[1];
2840 localcolor[2] = subcolor[2];
2841 localcolor[3] = subcolor[3];
2842 for (x = startx;x < endx;x++)
2844 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2845 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2846 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2847 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2848 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2849 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2850 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2851 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2855 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 int x, startx = span->startx, endx = span->endx;
2858 for (x = startx;x < endx;x++)
2860 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2861 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2862 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2863 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2867 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2869 int x, startx = span->startx, endx = span->endx;
2870 for (x = startx;x < endx;x++)
2872 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2873 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2874 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2875 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Mixes two float RGBA span buffers per pixel: out4f = ina4f * a + inb4f * b,
// where a = 1 - (channel 3 of inb4f) at that pixel.
// NOTE(review): the declaration of a/b and the assignment of b are not visible
// in this listing; presumably b is channel 3 of inb4f - confirm.
2879 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2881 int x, startx = span->startx, endx = span->endx;
2883 for (x = startx;x < endx;x++)
// weight of the first buffer is the complement of the second buffer's channel 3
2885 a = 1.0f - inb4f[x*4+3];
2887 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2888 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2889 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2890 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2894 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2896 int x, startx = span->startx, endx = span->endx;
2897 float localcolor[4], ilerp, lerp;
2898 localcolor[0] = color[0];
2899 localcolor[1] = color[1];
2900 localcolor[2] = color[2];
2901 localcolor[3] = color[3];
2902 ilerp = 1.0f - localcolor[3];
2903 lerp = localcolor[3];
2904 for (x = startx;x < endx;x++)
2906 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2907 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2908 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2909 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// multiplies the BGRA8 pixels of in4ub by an interpolated vertex attribute
// (selected by arrayindex) and stores the BGRA8 result in out4ub, for the
// pixels of the span between span->startx and span->endx.
// zf is indexed per pixel and scales the attribute value (presumably the
// factor filled in by DPSOFTRAST_Draw_Span_Begin - TODO confirm).
2915 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2919 int startx = span->startx;
2920 int endx = span->endx;
2923 __m128i submod, substep, endsubmod;
2924 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) exchanges lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA -> BGRA to match the pixel byte order - TODO confirm)
2925 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2926 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at startx times zf[startx], and the same value scaled by 256 as integers
2927 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2928 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2929 for (x = startx; x < endx;)
// nextsub is the pixel where the next exact attribute value is evaluated,
// endsub the last pixel handled before that
2931 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default scale: 256 spread over DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2932 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2933 if (nextsub >= endx)
// near the end of the span: evaluate at the last pixel (endx-1) and spread
// the step over the remaining pixel count instead
2935 nextsub = endsub = endx-1;
2936 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact attribute value at nextsub
2940 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// integer step: (endmod - mod) * subscale
2941 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2942 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// pack the multipliers of two adjacent pixels (submod and submod+substep)
// into eight signed-saturated 16-bit lanes; substep is duplicated for both
2943 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2944 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration while submod advances
// by a single substep - verify whether the step was meant to be doubled here
2945 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// _mm_unpacklo_epi8(zero, pixels) puts each byte in the high half of a 16-bit
// lane, so _mm_mulhi_epu16 gives (byte * submod) >> 8; packus saturates to bytes
2947 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2948 pix = _mm_mulhi_epu16(pix, submod);
2949 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for one pixel (32-bit load and store)
2953 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2954 pix = _mm_mulhi_epu16(pix, submod);
2955 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// writes an interpolated vertex attribute (selected by arrayindex) as BGRA8
// pixels into out4ub for the pixels of the span between span->startx and
// span->endx. zf is indexed per pixel and scales the attribute value
// (presumably the factor filled in by DPSOFTRAST_Draw_Span_Begin - TODO confirm).
2962 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2966 int startx = span->startx;
2967 int endx = span->endx;
2970 __m128i submod, substep, endsubmod;
2971 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// _MM_SHUFFLE(3, 0, 1, 2) exchanges lanes 0 and 2 and keeps lanes 1 and 3
// (presumably RGBA -> BGRA to match the pixel byte order - TODO confirm)
2972 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2973 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// attribute at startx times zf[startx], and the same value scaled by 4095 as integers
2974 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2975 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2976 for (x = startx; x < endx;)
// nextsub is the pixel where the next exact attribute value is evaluated,
// endsub the last pixel handled before that
2978 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// default scale: 4095 spread over DPSOFTRAST_DRAW_MAXSUBSPAN pixels
2979 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2980 if (nextsub >= endx)
// near the end of the span: evaluate at the last pixel (endx-1) and spread
// the step over the remaining pixel count instead
2982 nextsub = endsub = endx-1;
2983 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact attribute value at nextsub
2987 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// integer step: (endmod - mod) * subscale
2988 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2989 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// pack the values of two adjacent pixels (submod and submod+substep)
// into eight signed-saturated 16-bit lanes; substep is duplicated for both
2990 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2991 substep = _mm_packs_epi32(substep, substep);
// NOTE(review): x advances by two pixels per iteration while submod advances
// by a single substep - verify whether the step was meant to be doubled here
2992 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// drop the low 4 bits of the 4095-scaled value, then saturate to bytes
2994 __m128i pix = _mm_srai_epi16(submod, 4);
2995 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for one pixel (32-bit store)
2999 __m128i pix = _mm_srai_epi16(submod, 4);
3000 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// BGRA8 bloom combine: out = ina + max(inb - subcolor, 0) per channel, with the
// final result saturated to 0..255, for the pixels of the span.
// subcolor holds four floats that are scaled by 255 before use.
3007 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3010 int x, startx = span->startx, endx = span->endx;
// subcolor * 255 as integers with lanes 0 and 2 exchanged (presumably RGBA ->
// BGRA - TODO confirm), then packed so the same four values cover two pixels
3011 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3012 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels (8 bytes) per iteration
3013 for (x = startx;x+2 <= endx;x+=2)
// zero-extend the bytes of both inputs to 16-bit lanes
3015 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3016 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// _mm_subs_epu16 is an unsigned saturating subtract, so the difference stops at 0
3017 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3018 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one pixel at x (32-bit loads and store)
3022 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3023 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3024 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3025 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 modulate: out = (ina * inb) >> 8 per channel, for the pixels of the span
3030 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3033 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3034 for (x = startx;x+2 <= endx;x+=2)
// ina bytes go into the low half of each 16-bit lane, inb bytes into the high
// half, so the high 16 bits of the product equal (ina * inb) >> 8
3036 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3037 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3038 pix1 = _mm_mulhi_epu16(pix1, pix2);
3039 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one pixel at x (32-bit loads and store)
3043 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3044 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3045 pix1 = _mm_mulhi_epu16(pix1, pix2);
3046 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 additive combine: out = ina + inb per channel, saturated to 255,
// for the pixels of the span
3051 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3054 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3055 for (x = startx;x+2 <= endx;x+=2)
// zero-extend to 16-bit lanes so the sum cannot wrap before packus saturates it
3057 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3058 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3059 pix1 = _mm_add_epi16(pix1, pix2);
3060 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one pixel at x (32-bit loads and store)
3064 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3065 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3066 pix1 = _mm_add_epi16(pix1, pix2);
3067 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 tinted additive combine: out = ina + inb * tint per channel, saturated
// to 255, for the pixels of the span. inbtintbgra holds four float factors that
// are used in the order given (no lane shuffle is applied to them).
3072 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3075 int x, startx = span->startx, endx = span->endx;
// tint scaled by 256 as integers, packed so the same four values cover two pixels
3076 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3077 tint = _mm_packs_epi32(tint, tint);
// two pixels (8 bytes) per iteration
3078 for (x = startx;x+2 <= endx;x+=2)
// inb bytes sit in the high half of each 16-bit lane, so the high 16 bits of
// tint * inb equal (tint * inb) >> 8
3080 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3081 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3082 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3083 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one pixel at x (32-bit loads and store)
3087 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3088 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3089 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3090 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 alpha blend: out = ina + ((inb - ina) * inb.alpha >> 8) per channel,
// for the pixels of the span; the blend factor is byte 3 of each inb pixel
3095 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3098 int x, startx = span->startx, endx = span->endx;
// two pixels (8 bytes) per iteration
3099 for (x = startx;x+2 <= endx;x+=2)
3101 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3102 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast lane 3 of each pixel (its fourth byte) to all four lanes of that pixel
3103 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both factors are shifted left by 4 so that the signed high-half multiply
// returns (difference * blend) >> 8
3104 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3105 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// same computation for one pixel at x (only the low four lanes are meaningful)
3109 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3110 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3111 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3112 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3113 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// BGRA8 blend towards one constant color:
// out = in + ((color - in) * color.alpha >> 8) per channel, for the pixels of
// the span. color holds four floats that are scaled by 255 before use.
3118 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3121 int x, startx = span->startx, endx = span->endx;
// color * 255 as integers with lanes 0 and 2 exchanged (presumably RGBA ->
// BGRA - TODO confirm), then packed so the same four values cover two pixels
3122 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3123 localcolor = _mm_packs_epi32(localcolor, localcolor);
// lane 3 of the color broadcast to every lane and pre-shifted left by 4
3124 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels (8 bytes) per iteration
3125 for (x = startx;x+2 <= endx;x+=2)
3127 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed high-half
// multiply returns (difference * alpha) >> 8
3128 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3129 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// same computation for one pixel at x (32-bit load and store)
3133 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3134 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3135 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// vertex shader of the generic mode: projects the position array with the
// ModelViewProjection matrix uniform and loads the color and texcoord0 arrays;
// texcoord1 is loaded only when the SPECULAR permutation bit is set
3142 void DPSOFTRAST_VertexShader_Generic(void)
3144 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3145 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3146 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3147 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3148 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3151 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3153 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3154 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3155 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3156 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3157 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3158 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3160 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3161 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3162 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3164 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3165 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3168 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3170 else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3173 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3175 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3178 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3183 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3184 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// vertex shader of the postprocess mode: projects the position array with the
// ModelViewProjection matrix uniform and loads texcoord0.
// the last call pairs TEXCOORD1 with TEXCOORD4 (presumably storing the input
// texcoord4 array as texcoord1 - TODO confirm the argument order of
// DPSOFTRAST_Array_Load)
3189 void DPSOFTRAST_VertexShader_PostProcess(void)
3191 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3192 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3193 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3196 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3198 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3199 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3200 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3201 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3202 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3203 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3204 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3206 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3207 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3209 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3210 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3212 // TODO: implement saturation
3214 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3216 // TODO: implement gammaramps
3218 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// vertex shader of the depth/shadow mode: only projects the position array
// with the ModelViewProjection matrix uniform, no other arrays are touched
3223 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3225 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3228 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3230 // this is never called (because colormask is off when this shader is used)
3231 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3232 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3234 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3235 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// vertex shader of the flat color mode: projects the position array with the
// ModelViewProjection matrix uniform and transforms texcoord0 by the
// TexMatrix uniform
3240 void DPSOFTRAST_VertexShader_FlatColor(void)
3242 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3243 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// pixel shader of the flat color mode: color texture modulated by the
// Color_Ambient uniform (rgb) and the Alpha uniform (alpha).
// results are written straight into color buffer 0 unless alpha test or a
// non-opaque blend mode is active, in which case they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3246 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3249 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the BGRA8 pixel at (span->x, span->y) in color buffer 0
3250 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3251 int x, startx = span->startx, endx = span->endx;
3252 __m128i Color_Ambientm;
3253 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3254 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3255 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3256 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3257 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer
3258 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3259 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 as integers with lanes 0 and 2 exchanged; lane 3 is then
// replaced by the Alpha uniform * 255, and the four values are packed to
// 16-bit lanes twice over so they cover two pixels
3260 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3261 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3262 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3263 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3264 for (x = startx;x < endx;x++)
// fast path: four pixels at once when the next four pixelmask bytes are all 1
3267 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// texture bytes go into the high half of each 16-bit lane, so the high-half
// multiply returns (Color_Ambientm * texel) >> 8
3270 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3271 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3272 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3273 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single pixel version of the same computation
3279 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3280 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3281 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3283 if (pixel == buffer_FragColorbgra8)
3284 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// vertex shader of the vertex color mode: projects the position array with the
// ModelViewProjection matrix uniform, loads the color array and transforms
// texcoord0 by the TexMatrix uniform
3290 void DPSOFTRAST_VertexShader_VertexColor(void)
3292 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3293 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3294 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// pixel shader of the vertex color mode:
// texel * (Color_Diffuse * interpolated vertex color + Color_Ambient), with the
// alpha lane of the sum coming from the Alpha uniform.
// results are written straight into color buffer 0 unless alpha test or a
// non-opaque blend mode is active, in which case they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3297 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3300 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the BGRA8 pixel at (span->x, span->y) in color buffer 0
3301 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3302 int x, startx = span->startx, endx = span->endx;
3303 __m128i Color_Ambientm, Color_Diffusem;
3305 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3306 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3307 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3309 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3310 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer
3311 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3312 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 with lanes 0 and 2 exchanged, lane 3 replaced by the
// Alpha uniform * 255, packed to 16-bit lanes for two pixels
3313 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3316 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse * 4096 with lanes 0 and 2 exchanged and lane 3 cleared
3317 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3318 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3319 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color attribute: lanes 0 and 2 exchanged, advanced to startx, and
// both value and per-pixel slope scaled by 4096
3320 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3321 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3322 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3323 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3324 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3325 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// the loop increment advances data by one pixel
3326 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3328 __m128i color, mod, pix;
// fast path: four pixels at once when the next four pixelmask bytes are all 1
3329 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four consecutive buffer_z values, one per pixel; data is stepped by slope
// between the pixels, so the statement order below matters
3332 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3333 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3334 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3335 data = _mm_add_ps(data, slope);
3336 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3337 data = _mm_add_ps(data, slope);
3338 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3339 data = _mm_add_ps(data, slope);
3340 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertexcolor + ambient) * texel, using high-half multiplies;
// pix covers the first two pixels, pix2 the last two
3341 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3342 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3343 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3344 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3345 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single pixel version of the same computation
3351 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3352 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3353 mod = _mm_packs_epi32(mod, mod);
3354 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3355 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3357 if (pixel == buffer_FragColorbgra8)
3358 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// vertex shader of the lightmap mode: projects the position array with the
// ModelViewProjection matrix uniform, transforms texcoord0 by the TexMatrix
// uniform and loads texcoord4 (sampled as the lightmap coordinate by
// DPSOFTRAST_PixelShader_Lightmap)
3364 void DPSOFTRAST_VertexShader_Lightmap(void)
3366 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3367 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3368 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// pixel shader of the lightmap mode:
// texel * (Color_Diffuse * lightmap + Color_Ambient), plus Color_Glow * glow
// texel when the GLOW permutation bit is set; the alpha lane of the sum comes
// from the Alpha uniform.
// results are written straight into color buffer 0 unless alpha test or a
// non-opaque blend mode is active, in which case they go to a local buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3371 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3374 unsigned char * RESTRICT pixelmask = span->pixelmask;
// address of the BGRA8 pixel at (span->x, span->y) in color buffer 0
3375 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3376 int x, startx = span->startx, endx = span->endx;
3377 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3378 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3379 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses texcoord0, the lightmap uses texcoord4
3384 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3385 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test / blending need the finish pass, so render into the local buffer
3386 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3387 pixel = buffer_FragColorbgra8;
// Color_Ambient * 256 with lanes 0 and 2 exchanged, lane 3 replaced by the
// Alpha uniform * 255, packed to 16-bit lanes for two pixels
3388 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3391 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// Color_Diffuse * 256 with lanes 0 and 2 exchanged and lane 3 cleared
3392 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3395 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture with texcoord0 and prepare
// Color_Glow * 256 (lanes 0 and 2 exchanged, lane 3 cleared)
3397 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3398 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3399 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3400 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// ambient in the low 64 bits, glow color in the high 64 bits (used by the
// single pixel path below)
3401 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3402 for (x = startx;x < endx;x++)
3404 __m128i color, lightmap, glow, pix;
// fast path: four pixels at once when the next four pixelmask bytes are all 1
3405 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3408 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3409 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3410 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel + glowcolor * glowtexel;
// pix covers the first two pixels, pix2 the last two
3411 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3412 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3413 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3414 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3415 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3416 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3417 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single pixel: the low 64 bits compute (diffuse * lightmap + ambient) * texel
// and the high 64 bits glowcolor * glowtexel in the same multiply; the two
// halves are then added together
3423 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3424 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3425 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3426 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3427 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3428 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3433 for (x = startx;x < endx;x++)
3435 __m128i color, lightmap, pix;
// fast path: four pixels at once when the next four pixelmask bytes are all 1
3436 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3439 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3440 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
// (diffuse * lightmap + ambient) * texel
3441 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3442 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3443 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3444 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3445 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single pixel version of the same computation
3451 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3452 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3453 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3454 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only needed when the local buffer was used instead of the framebuffer
3457 if (pixel == buffer_FragColorbgra8)
3458 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// forward declarations: the FakeLight and LightDirectionMap shaders that follow
// call these two functions before DPSOFTRAST_VertexShader_LightDirection is defined
3463 void DPSOFTRAST_VertexShader_LightDirection(void);
3464 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// vertex shader of the fake light mode: identical to the light direction
// vertex shader, which it simply calls
3466 void DPSOFTRAST_VertexShader_FakeLight(void)
3468 DPSOFTRAST_VertexShader_LightDirection();
// pixel shader of the fake light mode: forwards its arguments unchanged to the
// light direction pixel shader
3471 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3473 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// vertex shader of the model space light direction map mode: runs the light
// direction vertex shader and additionally loads the texcoord4 array
3478 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3480 DPSOFTRAST_VertexShader_LightDirection();
3481 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// pixel shader of the model space light direction map mode: forwards its
// arguments unchanged to the light direction pixel shader
3484 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3486 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// vertex shader of the tangent space light direction map mode: runs the light
// direction vertex shader and additionally loads the texcoord4 array
3491 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3493 DPSOFTRAST_VertexShader_LightDirection();
3494 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// pixel shader of the tangent space light direction map mode: forwards its
// arguments unchanged to the light direction pixel shader
3497 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3499 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader for directional lighting. For every vertex it expresses two vectors in the per-vertex
// (svector, tvector, normal) basis read from TEXCOORD1/2/3:
//   - the LightDir uniform            -> written to TEXCOORD5 (w = 0)
//   - EyePosition minus the position  -> written to TEXCOORD6 (w = 0)
// and finally runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjection matrix uniform.
// Takes no parameters and returns nothing; all inputs and outputs go through the global dpsoftrast state.
// NOTE(review): this listing appears to omit short lines - the declarations of i, LightDir, EyeVector,
// position, svector, tvector and normal are used below but not visible here; confirm against the full file.
3504 void DPSOFTRAST_VertexShader_LightDirection(void)
3507 int numvertices = dpsoftrast.numvertices;
3509 float LightVector[4];
3510 float EyePosition[4];
3511 float EyeVectorModelSpace[4];
// Copy the two uniforms into locals. Only components 0..2 are used by the math below;
// component 3 of each is read but not referenced again in the visible code.
3517 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3518 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3519 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3520 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3521 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3522 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3523 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3524 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Prepare the arrays the loop reads from dpsoftrast.post_array4f. TEXCOORD0 goes through the TexMatrix
// uniform; the others are loaded as-is.
// NOTE(review): presumably Array_Load/Array_Transform fill post_array4f from the input arrays - confirm.
3525 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3526 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3527 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3528 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3529 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3530 for (i = 0;i < numvertices;i++)
// Fetch this vertex's position and its three basis vectors (4 floats per vertex, xyz used).
3532 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3533 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3534 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3535 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3536 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3537 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3538 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3539 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3540 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3541 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3542 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3543 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction in the vertex basis: one dot product per basis vector.
3544 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3545 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3546 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3547 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3548 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3549 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3550 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye, then the same change of basis; not normalized here.
3551 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3552 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3553 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3554 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3555 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3556 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3557 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3558 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3559 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3560 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// Runs after the loop so the loop sees the loaded, not yet projected, positions.
// NOTE(review): the meaning of the -1 second argument is not visible in this chunk - confirm.
3562 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small math helper macros used by the pixel shaders.
// Min/Max expand each argument more than once, so arguments must be free of side effects.
3565 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3566 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of two indexable operands.
3567 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
// Squared length and length of a 3-component vector; Length goes through sqrt().
3568 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3569 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Statement-style macro that normalizes a 3-component vector; the visible part computes its length
// with sqrt() of the self dot product.
// NOTE(review): the remaining continuation lines of this macro are not visible in this listing, so how
// a zero length is handled cannot be confirmed from here.
3570 #define DPSOFTRAST_Vector3Normalize(v)\
3573 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, LightDirectionMap (model/tangent space) and FakeLight modes.
// Samples the span's textures into per-pixel BGRA8 scratch buffers, shades every pixel from span->startx
// up to (not including) span->endx into buffer_FragColorbgra8, and passes that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//   thread   - per-thread state; supplies uniform4f, shader_permutation, shader_mode, shader_exactspecularmath
//   triangle - triangle state, forwarded to the span/texture/attribute helpers
//   span     - span being shaded; supplies startx and endx
// thread->shader_permutation selects one of three paths: SPECULAR (ambient + diffuse + specular),
// DIFFUSE (ambient + diffuse), or neither (ambient only); glow is added in each path when enabled.
// Inside the two lit paths thread->shader_mode selects where light direction and light color come from.
// NOTE(review): this listing appears to omit short lines - braces, `else`, and the declarations or
// assignments of z, d, f, diffuse, specular, glosstex and eyenormal, all of which are used below.
// Confirm the exact nesting against the full file.
3584 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3586 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3587 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3590 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3591 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3592 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3593 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3594 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3595 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3596 int x, startx = span->startx, endx = span->endx;
3597 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3598 float LightVectordata[4];
3599 float LightVectorslope[4];
3600 float EyeVectordata[4];
3601 float EyeVectorslope[4];
3602 float VectorSdata[4];
3603 float VectorSslope[4];
3604 float VectorTdata[4];
3605 float VectorTslope[4];
3606 float VectorRdata[4];
3607 float VectorRslope[4];
3609 float diffusetex[4];
3611 float surfacenormal[4];
3612 float lightnormal[4];
3613 float lightnormal_modelspace[4];
3615 float specularnormal[4];
3618 float SpecularPower;
// Color uniforms are copied with components 0,1,2 stored at indices 2,1,0 so that index k lines up
// with byte k of the *bgra8 texture buffers they are multiplied with below.
3620 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3621 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3622 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3623 Color_Glow[3] = 0.0f;
3624 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3625 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3626 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// The alpha scale comes from the separate Alpha uniform, not from Color_Ambient's own fourth component.
3627 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3628 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3629 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3630 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3631 Color_Pants[3] = 0.0f;
3632 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3633 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3634 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3635 Color_Shirt[3] = 0.0f;
// Start the span and sample the base color texture with TEXCOORD0. buffer_z is handed to every
// texture fetch below; presumably Span_Begin fills it - TODO confirm.
3636 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3637 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// The pants/shirt and glow textures are only sampled when their permutation bits are set, so their
// buffers hold valid data only in that case.
3638 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3640 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3641 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3643 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3645 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3647 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3649 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3650 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3651 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3652 Color_Diffuse[3] = 0.0f;
// Default light color from the uniform; the light-direction-map and FakeLight modes overwrite
// components 0..2 per pixel inside the loop.
3653 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3654 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3655 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3656 LightColor[3] = 0.0f;
3657 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3658 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3659 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3660 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3661 Color_Specular[3] = 0.0f;
// Pre-divided by 255 because the pow() exponent below multiplies this by the raw gloss alpha byte.
3662 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// The eye vector (TEXCOORD6) is set up for every mode in this path: it feeds the specular term and,
// in FakeLight mode, is also used as the light direction.
3663 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3664 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Per-mode setup of the light inputs. Model space additionally needs the interpolated VectorS/T/R
// (TEXCOORD1/2/3); both map modes sample lightmap and deluxemap with TEXCOORD4.
3666 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3668 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3669 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3670 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3671 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3672 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3674 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3676 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3677 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3679 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3681 // nothing of this needed
// Remaining modes (presumably under an `else` not visible in this listing): use the per-vertex light
// vector from TEXCOORD5.
3685 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3688 for (x = startx;x < endx;x++)
// Base color as floats in byte units (0..255).
3691 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3692 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3693 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3694 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Add the pants and shirt layers tinted by their uniforms; both alpha tints are 0, so alpha is unchanged.
3695 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3697 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3698 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3699 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3700 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3702 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3703 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3704 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3705 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map: byte * (1/128) - 1 maps 0..255 to [-1, 1). Byte 2 becomes component 0 and
// byte 0 becomes component 2, then the result is normalized.
3706 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3707 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3708 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3709 DPSOFTRAST_Vector3Normalize(surfacenormal);
3711 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3713 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3714 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3715 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3716 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3718 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3719 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3720 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3721 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3723 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3724 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3725 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3726 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3728 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3729 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3730 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3731 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3733 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3734 DPSOFTRAST_Vector3Normalize(lightnormal);
3736 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Scale the lightmap bytes by 1/256 and divide by lightnormal[2] clamped to at least 0.25.
// NOTE(review): this uses `max`, not the DPSOFTRAST_Max macro defined earlier in the file - confirm
// that `max` is provided elsewhere.
3738 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3739 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3740 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3741 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3744 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// Tangent-space deluxemap: the decoded direction is used directly (no normalize call in this branch)
// and the lightmap is scaled by a plain 1/256.
3746 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3747 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3748 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3750 float f = 1.0f / 256.0f;
3751 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3752 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3753 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3756 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// FakeLight: the normalized eye vector is used as the light direction and the light color is 1.0.
3758 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3759 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3760 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3761 DPSOFTRAST_Vector3Normalize(lightnormal);
3763 LightColor[0] = 1.0;
3764 LightColor[1] = 1.0;
3765 LightColor[2] = 1.0;
// Remaining modes (presumably the final `else`): normalized interpolated light vector; LightColor
// keeps the uniform value loaded before the loop.
3769 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3770 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3771 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3772 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term: N dot L clamped at zero.
3775 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3777 if(thread->shader_exactspecularmath)
3779 // reflect lightnormal at surfacenormal, take the negative of that
3780 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3782 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3783 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3784 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3785 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3787 // dot of this and normalize(EyeVectorFogDepth.xyz)
3788 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3789 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3790 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3791 DPSOFTRAST_Vector3Normalize(eyenormal);
3793 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Otherwise (presumably the `else` of the exactspecularmath test): half-vector form -
// normalize(light + eye) dotted with the surface normal, clamped at zero.
3797 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3798 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3799 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3800 DPSOFTRAST_Vector3Normalize(eyenormal);
3802 specularnormal[0] = lightnormal[0] + eyenormal[0];
3803 specularnormal[1] = lightnormal[1] + eyenormal[1];
3804 specularnormal[2] = lightnormal[2] + eyenormal[2];
3805 DPSOFTRAST_Vector3Normalize(specularnormal);
3807 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha byte / 255 (the division was folded into SpecularPower).
3810 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel in byte units; results are clamped only at the upper bound of 255.
// Alpha is the base alpha scaled by the Alpha uniform and receives no lighting.
3811 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3813 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3814 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3815 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3816 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
// Same combination without the glow term.
3820 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3821 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3822 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3823 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3826 buffer_FragColorbgra8[x*4+0] = d[0];
3827 buffer_FragColorbgra8[x*4+1] = d[1];
3828 buffer_FragColorbgra8[x*4+2] = d[2];
3829 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation without specular (ambient + diffuse) ----
3832 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3834 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3835 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3836 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3837 Color_Diffuse[3] = 0.0f;
3838 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3839 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3840 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3841 LightColor[3] = 0.0f;
3842 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Per-mode setup, as in the specular path, except that the eye vector is only set up for FakeLight
// because no specular term is computed here.
3844 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3846 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3847 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3848 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3849 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3850 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3852 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3854 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3855 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3857 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3859 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
// Remaining modes (presumably under an `else` not visible in this listing).
3863 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3866 for (x = startx;x < endx;x++)
// Note: unlike the specular path, this loop does not add the pants/shirt layers to diffusetex.
3869 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3870 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3871 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3872 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// Normal-map decode, identical to the specular path.
3873 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3874 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3875 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3876 DPSOFTRAST_Vector3Normalize(surfacenormal);
3878 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3880 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3881 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3882 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3883 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3885 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3886 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3887 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3888 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3890 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3891 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3892 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3893 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3895 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3896 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3897 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3898 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3900 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3901 DPSOFTRAST_Vector3Normalize(lightnormal);
3903 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3905 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3906 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3907 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3908 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3911 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3913 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3914 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3915 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3917 float f = 1.0f / 256.0f;
3918 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3919 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3920 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3923 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3925 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3926 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3927 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3928 DPSOFTRAST_Vector3Normalize(lightnormal);
3930 LightColor[0] = 1.0;
3931 LightColor[1] = 1.0;
3932 LightColor[2] = 1.0;
// Remaining modes (presumably the final `else`): normalized interpolated light vector.
3936 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3937 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3938 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3939 DPSOFTRAST_Vector3Normalize(lightnormal);
// Diffuse term clamped at zero, then base color * (ambient + diffuse * light), plus glow when enabled.
3942 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3943 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3945 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3946 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3947 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3948 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3952 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3953 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3954 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3955 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3957 buffer_FragColorbgra8[x*4+0] = d[0];
3958 buffer_FragColorbgra8[x*4+1] = d[1];
3959 buffer_FragColorbgra8[x*4+2] = d[2];
3960 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: neither specular nor diffuse (presumably the final `else`): base color scaled by the
// ambient color, plus glow when enabled; no normal map or light inputs are used ----
3965 for (x = startx;x < endx;x++)
3968 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3969 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3970 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3971 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3973 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3975 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3976 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3977 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3978 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3982 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3983 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3984 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3985 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3987 buffer_FragColorbgra8[x*4+0] = d[0];
3988 buffer_FragColorbgra8[x*4+1] = d[1];
3989 buffer_FragColorbgra8[x*4+2] = d[2];
3990 buffer_FragColorbgra8[x*4+3] = d[3];
// All three paths end here: pass the shaded BGRA8 span to the common finish routine.
3993 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3998 void DPSOFTRAST_VertexShader_LightSource(void)
4001 int numvertices = dpsoftrast.numvertices;
4002 float LightPosition[4];
4003 float LightVector[4];
4004 float LightVectorModelSpace[4];
4005 float EyePosition[4];
4006 float EyeVectorModelSpace[4];
4012 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4013 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4014 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4015 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4016 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4017 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4018 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4019 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4020 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4021 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4022 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4023 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4024 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4025 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4026 for (i = 0;i < numvertices;i++)
4028 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4029 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4030 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4031 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4032 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4033 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4034 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4035 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4036 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4037 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4038 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4039 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4040 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4041 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4042 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4043 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4044 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4045 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4046 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4047 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4048 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4049 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4050 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4051 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4052 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4053 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4054 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4055 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4056 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4057 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4058 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4059 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4061 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4062 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Per-span pixel shader for the LightSource shader mode.
//
// Reads the colour uniforms from thread->uniform4f, samples the textures required by the
// active thread->shader_permutation bits into per-span BGRA8 scratch buffers, computes a lit
// colour for every pixel x in [span->startx, span->endx) into buffer_FragColorbgra8, and
// finally passes that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread   - per-thread rasterizer state (uniforms, shader_permutation, shader_exactspecularmath)
// triangle - triangle the span belongs to (forwarded to the attribute/texture helpers)
// span     - span to shade; supplies startx/endx
//
// NOTE(review): z, glosstex, eyenormal, diffuse, specular, attenuation, d and f are used below
// but their declarations/assignments are not shown in this listing; the same holds for braces
// and for the statements guarded by the "attenuation < 0.01f" tests. Consult the full file.
4065 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers: depth values plus one BGRA8 buffer per sampled texture and the output colour
4068 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4069 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4070 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4071 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4072 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4073 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4074 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4076 int x, startx = span->startx, endx = span->endx;
4077 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
// interpolation base ("data") and per-pixel step ("slope") for the three varyings used below
4078 float CubeVectordata[4];
4079 float CubeVectorslope[4];
4080 float LightVectordata[4];
4081 float LightVectorslope[4];
4082 float EyeVectordata[4];
4083 float EyeVectorslope[4];
4085 float diffusetex[4];
4087 float surfacenormal[4];
4088 float lightnormal[4];
4090 float specularnormal[4];
4093 float SpecularPower;
4094 float CubeVector[4];
// Uniform colours are copied with components 0 and 2 swapped (uniform component 0 lands in index 2),
// presumably so that index order matches the byte order of the *bgra8 buffers.
// Color_Glow is loaded here but not referenced again in the visible lines of this function.
4097 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4098 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4099 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4100 Color_Glow[3] = 0.0f;
4101 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4102 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4103 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component comes from the separate Alpha uniform
4104 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4105 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4106 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4107 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4108 Color_Diffuse[3] = 0.0f;
4109 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4110 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4111 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4112 Color_Specular[3] = 0.0f;
4113 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4114 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4115 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4116 Color_Pants[3] = 0.0f;
4117 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4118 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4119 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4120 Color_Shirt[3] = 0.0f;
4121 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4122 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4123 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4124 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by the raw 0..255 gloss alpha byte (glosstex[3])
4125 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// varyings: light vector from TEXCOORD1, eye vector from TEXCOORD2, cube/attenuation vector from TEXCOORD3
4126 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4127 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4128 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4129 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4130 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// texture fetches shared by all three lighting paths: base colour always, pants/shirt and cube map only when enabled
4131 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4134 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4135 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4137 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4138 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: SPECULAR permutation (ambient + diffuse + specular); needs normal and gloss maps ----
4139 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4141 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4142 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4143 for (x = startx;x < endx;x++)
// interpolate the cube vector at x (scaled by z); attenuation = 1 - |CubeVector|^2
4146 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4147 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4148 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4149 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
// NOTE(review): guarded statement not shown; presumably skips this pixel, leaving the pre-cleared zero colour
4150 if (attenuation < 0.01f)
4152 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4154 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4155 if (attenuation < 0.01f)
// base colour as floats in byte units (0..255)
4159 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4160 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4161 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4162 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4163 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add tinted pants/shirt layers; Color_Pants[3] and Color_Shirt[3] are 0, so the fourth component is unchanged
4165 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4166 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4167 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4168 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4170 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4171 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4172 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4173 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 per component, taking bytes 2,1,0 as x,y,z, then normalize
4174 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4175 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4176 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4177 DPSOFTRAST_Vector3Normalize(surfacenormal);
// interpolated, normalized light direction
4179 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4180 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4181 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4182 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at zero
4184 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact specular: dot of the eye vector with the light vector reflected about the surface normal
4186 if(thread->shader_exactspecularmath)
4188 // reflect lightnormal at surfacenormal, take the negative of that
4189 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4191 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4192 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4193 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4194 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4196 // dot of this and normalize(EyeVectorFogDepth.xyz)
4197 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4198 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4199 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4200 DPSOFTRAST_Vector3Normalize(eyenormal);
4202 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// otherwise (presumably the else branch; the keyword line is not shown): specular from the
// normalized sum of light and eye vectors dotted with the surface normal
4206 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4207 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4208 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4209 DPSOFTRAST_Vector3Normalize(eyenormal);
4211 specularnormal[0] = lightnormal[0] + eyenormal[0];
4212 specularnormal[1] = lightnormal[1] + eyenormal[1];
4213 specularnormal[2] = lightnormal[2] + eyenormal[2];
4214 DPSOFTRAST_Vector3Normalize(specularnormal);
4216 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform (already divided by 255) times the gloss alpha byte
4218 specular = pow(specular, SpecularPower * glosstex[3]);
// final colour per channel: (tex*(ambient + diffuse*N.L) + gloss*specularcolor*spec) * lightcolor [* cube texel] * attenuation,
// truncated to int and clamped to at most 255; the fourth channel is the texture's own fourth component
4220 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4222 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4223 attenuation *= (1.0f / 255.0f);
4224 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4225 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4226 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4227 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
// same formula without the cube map factor
4231 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4232 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4233 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4234 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4236 buffer_FragColorbgra8[x*4+0] = d[0];
4237 buffer_FragColorbgra8[x*4+1] = d[1];
4238 buffer_FragColorbgra8[x*4+2] = d[2];
4239 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: DIFFUSE permutation (ambient + diffuse, no specular); needs only the normal map ----
4242 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4244 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4245 for (x = startx;x < endx;x++)
// attenuation and shadow map handling identical to path 1
4248 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4249 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4250 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4251 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4252 if (attenuation < 0.01f)
4254 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4256 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4257 if (attenuation < 0.01f)
4261 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4262 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4263 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4264 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4265 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4267 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4268 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4269 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4270 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// decode and normalize the surface normal, then the light direction, as in path 1
4272 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4273 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4274 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4275 DPSOFTRAST_Vector3Normalize(surfacenormal);
4277 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4278 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4279 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4280 DPSOFTRAST_Vector3Normalize(lightnormal);
4282 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// final colour per channel: tex*(ambient + diffuse*N.L) * lightcolor [* cube texel] * attenuation, clamped to at most 255
4283 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4285 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4286 attenuation *= (1.0f / 255.0f);
4287 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4288 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4289 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4290 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4294 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4295 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4296 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4297 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4299 buffer_FragColorbgra8[x*4+0] = d[0];
4300 buffer_FragColorbgra8[x*4+1] = d[1];
4301 buffer_FragColorbgra8[x*4+2] = d[2];
4302 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: ambient only (presumably the final else of the chain; no normal map is sampled) ----
4307 for (x = startx;x < endx;x++)
4310 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4311 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4312 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4313 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4314 if (attenuation < 0.01f)
4316 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4318 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4319 if (attenuation < 0.01f)
4323 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4324 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4325 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4326 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4327 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4329 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4330 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4331 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4332 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
// final colour per channel: tex*ambient * lightcolor [* cube texel] * attenuation, clamped to at most 255
4334 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4336 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4337 attenuation *= (1.0f / 255.0f);
4338 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4339 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4340 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4341 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4345 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4346 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4347 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4348 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4350 buffer_FragColorbgra8[x*4+0] = d[0];
4351 buffer_FragColorbgra8[x*4+1] = d[1];
4352 buffer_FragColorbgra8[x*4+2] = d[2];
4353 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the computed colours for this span to the common finish routine
4356 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Refraction shader mode; takes no arguments and works on the global dpsoftrast state.
// Issues three array transforms, each with a matrix taken from dpsoftrast.uniform4f:
//   1. TEXCOORD1 / POSITION with the ModelViewProjectionMatrix (non-projecting transform);
//      the pixel shader reads TEXCOORD1 as the model-view-projection position.
//   2. TEXCOORD0 / TEXCOORD0 with the TexMatrix.
//   3. POSITION / POSITION with the ModelViewProjectionMatrix via TransformProject.
// NOTE(review): the helpers' argument order is presumably (output array, input array, matrix) — confirm against their definitions.
4362 void DPSOFTRAST_VertexShader_Refraction(void)
4364 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4365 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4366 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Per-span pixel shader for the Refraction shader mode.
//
// For every pixel x in [span->startx, span->endx) it computes a screen-space texture coordinate from
// the interpolated model-view-projection position (TEXCOORD1), offsets it by the x/y of the normalized
// normal-map texel scaled by DistortScaleRefractReflect, samples mip level 0 of the texture bound to
// GL20TU_REFRACTION (bilinear when the texture has DPSOFTRAST_TEXTURE_FILTER_LINEAR, nearest otherwise,
// both clamped to the texture edges), multiplies by RefractColor and stores BGRA8 into
// buffer_FragColorbgra8, which is finally passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Returns early without drawing when no refraction texture is bound.
//
// thread   - per-thread rasterizer state (uniforms, bound textures)
// triangle - triangle the span belongs to (forwarded to helpers)
// span     - span to shade; supplies startx/endx
//
// NOTE(review): v, iw and c are used below but their declarations are not shown in this listing,
// nor are braces or the else that separates the two sampling paths. Consult the full file.
4369 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4371 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4373 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4375 int x, startx = span->startx, endx = span->endx;
4378 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4379 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4382 float ModelViewProjectionPositiondata[4];
4383 float ModelViewProjectionPositionslope[4];
4386 float ScreenScaleRefractReflect[2];
4387 float ScreenCenterRefractReflect[2];
4388 float DistortScaleRefractReflect[2];
4389 float RefractColor[4];
4391 const unsigned char * RESTRICT pixelbase;
4392 const unsigned char * RESTRICT pixel[4];
4393 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4394 if(!texture) return;
// mipmap[0][0] is used as the byte offset of mip level 0; mipmap[0][2] and mipmap[0][3] are used below as its width and height
4395 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4398 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4399 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4402 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
// only the first two components of each scale/center/distort uniform are used
4405 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4406 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4407 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4408 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4409 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4410 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// RefractColor is stored with components 0 and 2 swapped so its index order matches the BGRA texel bytes it multiplies
4411 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4412 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4413 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4414 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4417 for (x = startx;x < endx;x++)
4419 float SafeScreenTexCoord[2];
4420 float ScreenTexCoord[2];
4427 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4428 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4430 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4431 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4432 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
// NOTE(review): the quoted GLSL below uses DistortScaleRefractReflect.zw, while this code reads components 0 and 1 of the uniform — confirm which is intended
4434 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4435 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4436 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4437 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4438 DPSOFTRAST_Vector3Normalize(v);
4439 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4440 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4442 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4443 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// Bilinear path. tc is a 20.12 fixed-point texel coordinate shifted by half a texel (2048).
// It must be SIGNED: coordinates left of/above the first texel centre are negative, and converting a
// negative float to unsigned is undefined behaviour; with an unsigned tc the ">= 0" edge clamps below
// could never trigger, so such pixels sampled the opposite edge instead of being clamped to texel 0.
// With a signed tc, "& 0xFFF" still yields the fractional part in 0..4095 and ">> 12" yields the floor
// (right shift of a negative int is implementation-defined, arithmetic on the compilers this file targets).
4445 int tc[2] = { (int)(ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048), (int)(ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048) };
4446 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4447 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// 24-bit weights of the four neighbouring texels; they sum to 0x1000*0x1000, hence the >>24 below
4448 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4449 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4450 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both sample positions to [0, width-1] x [0, height-1]
4451 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4452 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4453 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4454 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4455 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4456 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4457 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4458 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4459 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4460 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4461 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Nearest path (presumably the else branch; the keyword line is not shown): truncate to a texel index and clamp to the edges.
// NOTE(review): the lines that copy pixel[0] into c[] are not shown in this listing.
4465 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4466 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4474 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// NOTE(review): the colour products are stored without clamping; a RefractColor component above 1 could exceed 255 — confirm the uniform's range
4475 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4476 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4477 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4478 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4481 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the Water shader mode: the only visible work is a TransformProject call that
// names POSITION for both array arguments and uses the ModelViewProjectionMatrix uniform.
4486 void DPSOFTRAST_VertexShader_Water(void)
4488 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the Water shader mode. In the visible code this is a stub: it begins the span,
// zero-fills the BGRA8 colour bytes for pixels [span->startx, span->endx) and submits that all-zero
// colour span to DPSOFTRAST_Draw_Span_FinishBGRA8. No textures or uniforms are read.
// buffer_z is passed to Span_Begin and not read afterwards here.
4492 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4495 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4496 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4497 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4498 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4499 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the ShowDepth shader mode: the only visible work is a TransformProject call that
// names POSITION for both array arguments and uses the ModelViewProjectionMatrix uniform.
4504 void DPSOFTRAST_VertexShader_ShowDepth(void)
4506 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the ShowDepth shader mode. In the visible code this is a stub: it begins the span,
// zero-fills the BGRA8 colour bytes for pixels [span->startx, span->endx) and submits that all-zero
// colour span to DPSOFTRAST_Draw_Span_FinishBGRA8. The depth values in buffer_z are not turned into colour here.
4509 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4512 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4513 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4514 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4515 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4516 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredGeometry shader mode: the only visible work is a TransformProject call
// that names POSITION for both array arguments and uses the ModelViewProjectionMatrix uniform.
4521 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4523 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the DeferredGeometry shader mode. In the visible code this is a stub: it begins the
// span, zero-fills the BGRA8 colour bytes for pixels [span->startx, span->endx) and submits that
// all-zero colour span to DPSOFTRAST_Draw_Span_FinishBGRA8. No geometry attributes are written.
4526 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4529 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4532 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the DeferredLightSource shader mode: the only visible work is a TransformProject call
// that names POSITION for both array arguments and uses the ModelViewProjectionMatrix uniform.
4538 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4540 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for the DeferredLightSource shader mode. In the visible code this is a stub: it begins the
// span, zero-fills the BGRA8 colour bytes for pixels [span->startx, span->endx) and submits that
// all-zero colour span to DPSOFTRAST_Draw_Span_FinishBGRA8. No lighting is computed.
4543 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4546 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4547 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4548 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4549 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4550 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: its vertex-stage and span (pixel-stage) callbacks plus the lists of
// vertex arrays and texture units the mode uses.
// NOTE(review): a leading member precedes Vertex (every table row starts with an integer initializer)
// but its declaration is not shown in this listing.
4555 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage callback; takes no arguments
4558 void (*Vertex)(void);
// per-span pixel-stage callback; invoked with the thread state, the span's triangle and the span
4559 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// DPSOFTRAST_ARRAY_* indices used by the mode; table rows end the list with ~0 (0xFF as unsigned char)
4560 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// GL20TU_* texture unit indices used by the mode; table rows that list any end the list with ~0
4561 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4563 DPSOFTRAST_ShaderModeInfo;
// Table of shader modes, indexed by shader mode (DPSOFTRAST_Draw_ProcessSpans indexes it with
// thread->shader_mode to find the Span callback). Each row is:
//   { leading integer (2 in every row), vertex callback, span callback, {arrays..., ~0}, {texunits..., ~0} }
// where ~0 is stored as 0xFF in the unsigned char lists and appears to act as the list terminator.
// NOTE(review): the last four rows (Water, ShowDepth, DeferredGeometry, DeferredLightSource) omit the
// texunits initializer, so that member is zero-filled rather than starting with ~0 as in the
// Depth_Or_Shadow row — confirm the code that walks texunits tolerates this.
4565 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4567 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4568 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4569 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4570 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4571 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4572 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4573 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4574 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4575 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4576 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4577 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4578 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4579 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4580 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4581 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4582 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Builds the per-pixel pass mask for one span and stores the mask and span start back into the span.
//
// When thread->depthtest is set and a depth buffer exists, every pixel x in [startx, endx) gets
// pixelmask[x] = result of comparing the stored depthpixel[x] against the span's interpolated depth d
// (d starts at depth + depthslope*startx and advances by depthslope per pixel), using the comparison
// selected by thread->fb_depthfunc. Note the operand order: e.g. GL_LESS passes where the STORED value
// is less than d. Two while loops then test the mask at the left and right ends of the range.
// When depth testing is off (or there is no depth buffer) all mask bytes in the range are set to 1.
//
// thread - supplies depthtest, fb_depthfunc, the triangles array and the pixelmaskarray scratch buffer
// span   - span to test; receives pixelmask and the resulting startx
//
// NOTE(review): this listing omits short lines: the declarations of x/d/depth/depthslope/startx/endx,
// the assignment of endx, the bodies of the two trimming while loops (presumably startx++ / endx--),
// any default case of the switch, and presumably a store of endx back into the span.
4585 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4590 unsigned int *depthpixel;
4594 unsigned char *pixelmask;
4595 DPSOFTRAST_State_Triangle *triangle;
// triangle is looked up but not otherwise referenced in the visible lines
4596 triangle = &thread->triangles[span->triangle];
// depth buffer row for this span: row span->y of width fb_width, offset by span->x; pixel x is then indexed relative to it
4597 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4598 startx = span->startx;
4600 depth = span->depthbase;
4601 depthslope = span->depthslope;
4602 pixelmask = thread->pixelmaskarray;
4603 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4605 switch(thread->fb_depthfunc)
4608 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4609 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4610 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4611 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4612 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4613 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4614 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
// trim failing pixels from the left, then from the right (loop bodies not shown)
4616 while (startx < endx && !pixelmask[startx])
4618 while (endx > startx && !pixelmask[endx-1])
4623 // no depth testing means we're just dealing with color...
4624 memset(pixelmask + startx, 1, endx - startx);
4626 span->pixelmask = pixelmask;
4627 span->startx = startx;
// Depth-buffer write pass for one span. Does nothing unless thread->depthmask and thread->depthtest
// are both set and a depth buffer exists. Otherwise it walks pixels x in [startx, endx) with the same
// interpolated depth d as the depth test (d starts at depth + depthslope*startx, stepping by depthslope).
//
// thread - supplies depthmask/depthtest
// span   - supplies depthbase, depthslope, pixelmask, startx and the x/y used to locate the depth row
//
// NOTE(review): the loop body and the assignment of endx are not shown in this listing; presumably
// d is stored into depthpixel[x] for pixels whose pixelmask entry is set — confirm in the full file.
4631 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4633 int x, d, depth, depthslope, startx, endx;
4634 const unsigned char *pixelmask;
4635 unsigned int *depthpixel;
4636 if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4638 depth = span->depthbase;
4639 depthslope = span->depthslope;
4640 pixelmask = span->pixelmask;
4641 startx = span->startx;
// same depth row addressing as in the depth test: row span->y, offset by span->x
4643 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4644 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
// Processes and then clears the thread's queue of pending spans.
// For each of the thread->numspans queued spans: run the depth test (which fills the pixel mask and
// may move span->startx), run the current shader mode's Span callback if there is a colour buffer and
// a non-zero colour mask, then run the depth write. Finally the queue length is reset to 0.
//
// thread - per-thread state holding the spans/triangles arrays, shader_mode and framebuffer masks
//
// NOTE(review): the statement guarded by "span->startx >= span->endx" is not shown in this listing;
// presumably it skips spans left empty by the depth test.
4650 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4653 DPSOFTRAST_State_Triangle *triangle;
4654 DPSOFTRAST_State_Span *span;
4655 for (i = 0; i < thread->numspans; i++)
4657 span = &thread->spans[i];
4658 triangle = &thread->triangles[span->triangle];
4659 DPSOFTRAST_Draw_DepthTest(thread, span);
4660 if (span->startx >= span->endx)
4662 // run pixel shader if appropriate
4663 // do this before running depthmask code, to allow the pixelshader
4664 // to clear pixelmask values for alpha testing
4665 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4666 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4667 DPSOFTRAST_Draw_DepthWrite(thread, span);
4669 thread->numspans = 0;
// Draw command (id 22). The fields listed here are the ones the code below
// reads through DPSOFTRAST_Command_Draw: y extent, shared refcount, clip flag,
// vertex/triangle counts, the vertex data block and the 32-bit / 16-bit index lists.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably it also
// provides the opcode/commandsize header and DPSOFTRAST_OPCODE_Draw -- confirm.
4672 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// DPSOFTRAST_Interpret_Draw: per-thread handler for a Draw command.
// Walks the command's triangles, clips them against the near plane, sets up
// per-triangle plane equations (w, attributes, mip levels) and converts the
// part of each triangle that falls inside this thread's row ranges into spans
// in thread->spans. Span/triangle buffers are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever they fill and once more at the end.
// Every thread drops one reference on command->refcount; the thread that takes
// it to zero frees command->arrays when the command is no larger than the bare
// command header (i.e. the vertex data was allocated separately).
// NOTE(review): many short lines (braces, else, continue, some declarations)
// are not visible in this listing; comments below describe only what the
// visible lines establish.
4674 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4677 int cullface = thread->cullface;
4678 int minx, maxx, miny, maxy;
4679 int miny1, maxy1, miny2, maxy2;
4680 __m128i fbmin, fbmax;
4681 __m128 viewportcenter, viewportscale;
4682 int firstvertex = command->firstvertex;
4683 int numvertices = command->numvertices;
4684 int numtriangles = command->numtriangles;
4685 const int *element3i = command->element3i;
4686 const unsigned short *element3s = command->element3s;
4687 int clipped = command->clipped;
4694 int starty, endy, bandy;
4698 float clip0origin, clip0slope;
4700 __m128 triangleedge1, triangleedge2, trianglenormal;
4703 DPSOFTRAST_State_Triangle *triangle;
4704 DPSOFTRAST_Texture *texture;
4705 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// vertical limits: the scissor rectangle, and the thread's two row ranges
// (miny1..maxy1 and miny2..maxy2) clamped into it
4706 miny = thread->fb_scissor[1];
4707 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4708 miny1 = bound(miny, thread->miny1, maxy);
4709 maxy1 = bound(miny, thread->maxy1, maxy);
4710 miny2 = bound(miny, thread->miny2, maxy);
4711 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y extent misses both of this thread's row ranges
4712 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
// drop this thread's reference; the last one out frees separately allocated vertex data
4714 if (!ATOMIC_DECREMENT(command->refcount))
4716 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4717 MM_FREE(command->arrays);
4721 minx = thread->fb_scissor[0];
4722 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip rectangle as repeated 16-bit (x, y) pairs for the packed bounding-box test;
// it spans from the top of range 1 to the bottom of range 2, and fbmax is inclusive (-1)
4723 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4724 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4725 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4726 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4727 screen[3] = _mm_setzero_ps();
4728 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4729 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices float4 screen coordinates; the
// per-vertex attribute arrays follow (the first of them is the position array)
4731 const float *screencoord4f = command->arrays;
4732 const float *arrays = screencoord4f + numvertices*4;
4734 // generate the 3 edges of this triangle
4735 // generate spans for the triangle - switch based on left split or right split classification of triangle
// vertex indices come from the 16-bit or the 32-bit index list, rebased by firstvertex
// (the test selecting between the two lists is not visible here)
4738 e[0] = element3s[i*3+0] - firstvertex;
4739 e[1] = element3s[i*3+1] - firstvertex;
4740 e[2] = element3s[i*3+2] - firstvertex;
4744 e[0] = element3i[i*3+0] - firstvertex;
4745 e[1] = element3i[i*3+1] - firstvertex;
4746 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros defined next:
//   SKIPBACKFACE       - builds two screen-space edges from screen[1] and computes
//                        e1.x*e2.y - e1.y*e2.x (z of their cross product) in the low
//                        lane of trianglenormal, then tests its sign against zero
//                        (the cullface-dependent lines are not visible here)
//   CLIPPEDVERTEXLERP  - clipfrac[p1] = clipdist[p1] / (clipdist[p1] - clipdist[p2]);
//                        interpolates between vertices e[p1] and e[p2] by that
//                        fraction and projects the result into screen[k]
//   CLIPPEDVERTEXCOPY  - screen[k] = precomputed screen coordinate of vertex e[p1]
//   GENATTRIBCOPY/LERP - same copy / interpolate for the current attribute array,
//                        reusing the clipfrac saved by CLIPPEDVERTEXLERP
//   GENATTRIBS         - picks copy or lerp per output vertex; cases 0..6 mirror the
//                        near-plane clip configurations handled further down
//                        (the switch selector line is not visible here)
4755 #define SKIPBACKFACE \
4756 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4757 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4758 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4759 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4760 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4764 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4768 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4773 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4774 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4776 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4777 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4779 #define CLIPPEDVERTEXCOPY(k,p1) \
4780 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4782 #define GENATTRIBCOPY(attrib, p1) \
4783 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4784 #define GENATTRIBLERP(attrib, p1, p2) \
4786 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4787 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4789 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4793 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4794 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4795 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4796 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4797 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4805 // calculate distance from nearplane
4806 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4807 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4808 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// decision tree over which vertices have clipdist >= 0 (are kept); each leaf builds
// the 3 or 4 screen points of the clipped polygon from copies and edge interpolations
4809 if (clipdist[0] >= 0.0f)
4811 if (clipdist[1] >= 0.0f)
4813 if (clipdist[2] >= 0.0f)
4816 // triangle is entirely in front of nearplane
4817 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4824 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4832 if (clipdist[2] >= 0.0f)
4834 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4841 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4848 else if (clipdist[1] >= 0.0f)
4850 if (clipdist[2] >= 0.0f)
4852 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4859 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4865 else if (clipdist[2] >= 0.0f)
4867 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4872 else continue; // triangle is entirely behind nearplane
4875 // calculate integer y coords for triangle points
// the x,y of up to four screen points are truncated and packed to 16 bits, then
// reduced with min/max into a bounding box that is clamped to fbmin/fbmax
4876 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4877 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4878 screenmin = _mm_min_epi16(screeni, screenir),
4879 screenmax = _mm_max_epi16(screeni, screenir);
4880 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4881 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4882 screenmin = _mm_max_epi16(screenmin, fbmin);
4883 screenmax = _mm_min_epi16(screenmax, fbmax);
4884 // skip offscreen triangles
4885 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// 16-bit lane 1 holds the y component; endy is exclusive
4887 starty = _mm_extract_epi16(screenmin, 1);
4888 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two row ranges
// NOTE(review): the guarded statement is not visible (presumably `continue;`)
4889 if (starty >= maxy1 && endy <= miny2)
// per-point y as 32-bit integers (high half of each packed x,y pair)
4891 screeny = _mm_srai_epi32(screeni, 16);
// next free slot in the thread's triangle buffer
4894 triangle = &thread->triangles[thread->numtriangles];
4896 // calculate attribute plans for triangle data...
4897 // okay, this triangle is going to produce spans, we'd better project
4898 // the interpolants now (this is what gives perspective texturing),
4899 // this consists of simply multiplying all arrays by the W coord
4900 // (which is basically 1/Z), which will be undone per-pixel
4901 // (multiplying by Z again) to get the perspective-correct array
4904 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4905 __m128 mipedgescale, mipdensity;
// screen-space edge deltas divided by the normal's scalar component; the four
// splats are the coefficients that turn per-vertex differences into x / y slopes
4906 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4907 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4908 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4909 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4910 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// plane equation for the w component of the screen points, relative to point 1:
// triangle->w[0] = slope in x, w[1] = slope in y, w[2] = value at x = 0, y = 0
4911 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4912 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4913 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4914 attribedge1 = _mm_sub_ss(w0, w1);
4915 attribedge2 = _mm_sub_ss(w2, w1);
4916 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4917 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4918 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4919 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4920 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4921 _mm_store_ss(&triangle->w[0], attribxslope);
4922 _mm_store_ss(&triangle->w[1], attribyslope);
4923 _mm_store_ss(&triangle->w[2], attriborigin);
// a clip plane is active when any of its first three coefficients is non-zero:
// build the same kind of plane equation for screen z (component 2), combine it
// with fb_clipplane, and derive a per-scanline x boundary
// (clip0origin + y*clip0slope) plus a direction
4928 if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4930 float cliporigin, clipxslope, clipyslope;
4931 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4932 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4933 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4934 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4935 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4936 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4937 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4938 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4939 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
// NOTE(review): the condition guarding the division by clipxslope is not visible here
4942 clip0origin = -cliporigin/clipxslope;
4943 clip0slope = -clipyslope/clipxslope;
4944 clip0dir = clipxslope > 0 ? 1 : -1;
4946 else if(clipyslope > 0)
4948 clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4949 clip0slope = dpsoftrast.fb_width;
4952 else if(clipyslope < 0)
4954 clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4955 clip0slope = -dpsoftrast.fb_width;
4958 else if(clip0origin < 0) continue;
// for every attribute array the shader mode lists: multiply the three vertex
// values by w and store slope-x / slope-y / origin in triangle->attribs[k][0..2]
4961 mipedgescale = _mm_setzero_ps();
4962 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4964 __m128 attrib0, attrib1, attrib2;
4965 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an index >= DPSOFTRAST_ARRAY_TOTAL marks the end of the shader mode's array list
// (the guarded statement is not visible here)
4966 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// advance to the next per-vertex array in the data block
4968 arrays += numvertices*4;
4969 GENATTRIBS(attrib0, attrib1, attrib2);
4970 attriborigin = _mm_mul_ps(attrib1, w1);
4971 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4972 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4973 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4974 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4975 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4976 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4977 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4978 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the array that drives level-of-detail: attribute change along the two edges
// scaled by the reciprocal screen-space length of each edge
4979 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4981 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4982 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4983 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4984 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// per texture unit used by the shader mode: choose a mip level for textures whose
// filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR; all others keep level 0
4988 memset(triangle->mip, 0, sizeof(triangle->mip));
4989 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4991 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4992 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4994 texture = thread->texbound[texunit];
4995 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale the edge densities by two integers read from texture->mipmap[0][2..3]
// (presumably the base level's width and height -- TODO confirm), square and sum
// per edge, and keep the smaller of the two edge results
4997 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4998 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4999 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5000 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5001 // this will be multiplied in the texturing routine by the texture resolution
5002 y = _mm_cvtss_si32(mipdensity);
// 0.5*log2(y), i.e. log2 of the square root of the squared density; clamped to the last level
// NOTE(review): the guard in front of the log() call is not visible here
5005 y = (int)(log((float)y)*0.5f/M_LN2);
5006 if (y > texture->mipmaps - 1)
5007 y = texture->mipmaps - 1;
5008 triangle->mip[texunit] = y;
// scanline walk: first bounded by row range 1 (bandy = min(endy, maxy1)); each
// further pass jumps y to at least miny2 and bounds it by row range 2
5014 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5017 __m128 xcoords, xslope;
// ycc lane = all ones where y is greater than that point's y; yccmask carries
// four identical bits per point
5018 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5019 int yccmask = _mm_movemask_epi8(ycc);
5020 int edge0p, edge0n, edge1p, edge1n;
// Edge selection tables. edgeNp / edgeNn are the start / end point indices of the
// two polygon edges that cross the current scanline. 0xFFFF (y past every point)
// ends the walk; 0x0000 (y past no point) advances one line.
// NOTE(review): the switch headers are not visible; the 4-bit patterns in the
// first table suggest it serves 4-point polygons and the second 3-point ones -- confirm.
5029 case 0xFFFF: /*0000*/ y = endy; continue;
5030 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5031 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5032 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5033 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5034 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5035 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5036 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5037 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5038 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5039 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5040 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5041 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5042 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5043 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5044 case 0x0000: /*1111*/ y++; continue;
5052 case 0xFFFF: /*000*/ y = endy; continue;
5053 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5054 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5055 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5056 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5057 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5058 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5059 case 0x0000: /*111*/ y++; continue;
// nexty: last scanline handled with this edge pair. Points already passed are
// forced to a large value so the min picks the lowest y among the remaining
// points; the result is capped at bandy-1.
5062 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5063 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5064 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5065 nexty = _mm_extract_epi16(ycc, 0);
5066 if (nexty >= bandy) nexty = bandy-1;
// per-edge deltas divided by their own y delta (x lanes become dx/dy), then the
// x of both edges at scanline y, plus 0.5
5067 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5068 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5069 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5070 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5071 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low lanes: swap the two edges when edge 0 starts to the right
5072 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5074 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5075 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// clip-plane x boundary for scanline y
5077 clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5078 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5080 int startx, endx, offset;
// horizontal extent of this scanline, clamped to the scissor rectangle
5081 startx = _mm_cvtss_si32(xcoords);
5082 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5083 if (startx < minx) startx = minx;
5084 if (endx > maxx) endx = maxx;
5085 if (startx >= endx) continue;
// trim or reject against the clip-plane boundary
// (the tests selecting between these cases are only partly visible here)
5093 if(endx <= clip0) continue;
5094 startx = (int)clip0;
5097 else if (endx > clip0)
5099 if(startx >= clip0) continue;
// emit the scanline in pieces of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels
5104 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5106 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5107 span->triangle = thread->numtriangles;
5111 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5112 if (span->startx >= span->endx)
// evaluate the w plane at the span origin and convert value and x slope to
// integer depth units; the base value is biased by the polygon offset terms
5114 wslope = triangle->w[0];
5115 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5116 span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5117 span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// span buffer full: shade what has been queued so far
5118 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5119 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle buffer full: flush the spans that reference it and start over
5124 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5126 DPSOFTRAST_Draw_ProcessSpans(thread);
5127 thread->numtriangles = 0;
// all triangles consumed: drop this thread's reference on the command
5131 if (!ATOMIC_DECREMENT(command->refcount))
5133 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5134 MM_FREE(command->arrays);
// shade whatever is still queued
5137 if (thread->numspans > 0 || thread->numtriangles > 0)
5139 DPSOFTRAST_Draw_ProcessSpans(thread);
5140 thread->numtriangles = 0;
// DPSOFTRAST_Draw_AllocateDrawCommand: reserves a Draw command plus storage for
// transformed vertex data and a private copy of the index list.
// Data layout (each array is numvertices float4s): screen coordinates, the
// position array, then one array per attribute the current shader mode lists,
// followed by the indices (16-bit or 32-bit).
// When header + data would exceed DPSOFTRAST_DRAW_MAXCOMMANDSIZE the data is
// allocated separately with MM_CALLOC and only the header goes into the command
// pool; otherwise the data sits directly behind the header inside the pool.
// Side effects: points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] at
// the new storage and records firstvertex / numvertices in dpsoftrast.
// NOTE(review): the return statement is not visible here (presumably returns command).
5145 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5149 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two arrays are always present: screen coordinates and positions
5150 int datasize = 2*numvertices*sizeof(float[4]);
5151 DPSOFTRAST_Command_Draw *command;
5152 unsigned char *data;
// one more array per attribute used by the current shader mode
5153 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5155 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5156 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5158 datasize += numvertices*sizeof(float[4]);
// room for the index copy (the test choosing between the two sizes is not visible here)
5161 datasize += numtriangles*sizeof(unsigned short[3]);
5163 datasize += numtriangles*sizeof(int[3]);
5164 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5165 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
// too large for the pool: header only in the pool, data allocated separately
// NOTE(review): the MM_CALLOC result is not checked on any visible line
5167 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5168 data = (unsigned char *)MM_CALLOC(datasize, 1);
// fits: data lives inline, right after the aligned header
5172 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5173 data = (unsigned char *)command + commandsize;
5175 command->firstvertex = firstvertex;
5176 command->numvertices = numvertices;
5177 command->numtriangles = numtriangles;
5178 command->arrays = (float *)data;
// carve the data block into the individual arrays
5179 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5180 dpsoftrast.firstvertex = firstvertex;
5181 dpsoftrast.numvertices = numvertices;
5182 dpsoftrast.screencoord4f = (float *)data;
5183 data += numvertices*sizeof(float[4]);
5184 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5185 data += numvertices*sizeof(float[4]);
5186 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5188 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5189 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5191 dpsoftrast.post_array4f[j] = (float *)data;
5192 data += numvertices*sizeof(float[4]);
// copy the caller's indices so the command does not depend on the caller's buffers
5194 command->element3i = NULL;
5195 command->element3s = NULL;
5198 command->element3s = (unsigned short *)data;
5199 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5203 command->element3i = (int *)data;
5204 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// DPSOFTRAST_DrawTriangles: public entry point that queues one indexed draw.
//   firstvertex, numvertices - vertex range referenced by the indices
//   numtriangles             - number of triangles (3 indices each)
//   element3i / element3s    - 32-bit / 16-bit index lists, forwarded to the allocator
// Allocates the Draw command, runs the current shader mode's Vertex() stage,
// records the draw's y extent clamped to the framebuffer height, and either
// discards the command (empty extent) or publishes it with one reference per thread.
5209 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5211 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5212 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5213 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5214 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// nothing visible vertically: release separately allocated data and take the command back
5215 if (command->starty >= command->endy)
5217 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5218 MM_FREE(command->arrays);
5219 DPSOFTRAST_UndoCommand(command->commandsize);
5222 command->clipped = dpsoftrast.drawclipped;
// every thread releases one reference when it has finished with the command
5223 command->refcount = dpsoftrast.numthreads;
5225 if (dpsoftrast.usethreads)
// publish the command, then wake starving threads whose row ranges overlap the draw
5228 DPSOFTRAST_Draw_SyncCommands();
5229 for (i = 0; i < dpsoftrast.numthreads; i++)
5231 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5232 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5233 Thread_CondSignal(thread->drawcond);
// NOTE(review): the branch line in front of this call is not visible; presumably
// the non-threaded path, executing the command immediately
5238 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command (id 23) carrying the new framebuffer width and height.
// NOTE(review): DEFCOMMAND is defined outside this view.
5242 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Per-thread handler for SetRenderTargets: flags the thread's framebuffer-derived
// state for revalidation. The command's width / height are not read on the visible line.
5243 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5245 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// DPSOFTRAST_SetRenderTargets: public entry point that rebinds the framebuffer.
//   width, height   - dimensions of the new targets
//   depthpixels     - depth buffer
//   colorpixels0..3 - up to four color buffers
// Stores the new targets in dpsoftrast, recomputes the viewport center / scale
// for the new size, and queues a SetRenderTargets command for the threads.
5247 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5249 DPSOFTRAST_Command_SetRenderTargets *command;
// true when any target or dimension differs from the current binding
// NOTE(review): the statement guarded by this test is not visible here; presumably
// it flushes pending work that still refers to the old targets -- confirm
5250 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5251 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5252 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5254 dpsoftrast.fb_width = width;
5255 dpsoftrast.fb_height = height;
5256 dpsoftrast.fb_depthpixels = depthpixels;
5257 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5258 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5259 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5260 dpsoftrast.fb_colorpixels[3] = colorpixels3;
5261 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5262 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5263 command->width = width;
5264 command->height = height;
// DPSOFTRAST_Draw_InterpretCommands: executes queued commands for one thread.
//   thread    - thread state whose commandoffset marks where to resume
//   endoffset - pool offset at which to stop
// Dispatches on each command's opcode. Fixed-size commands advance by the aligned
// size of their struct; Draw advances by command->commandsize because its size
// varies. Offsets wrap to 0 at DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
// thread->commandoffset is written back after each Draw and again at the end.
5267 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5269 int commandoffset = thread->commandoffset;
5270 while (commandoffset != endoffset)
5272 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5273 switch (command->opcode)
// INTERPCOMMAND(name): one switch case per fixed-size command -- call the matching
// DPSOFTRAST_Interpret_<name> handler, step over the command, wrap at the pool end
5275 #define INTERPCOMMAND(name) \
5276 case DPSOFTRAST_OPCODE_##name : \
5277 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5278 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5279 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5280 commandoffset = 0; \
5282 INTERPCOMMAND(Viewport)
5283 INTERPCOMMAND(ClearColor)
5284 INTERPCOMMAND(ClearDepth)
5285 INTERPCOMMAND(ColorMask)
5286 INTERPCOMMAND(DepthTest)
5287 INTERPCOMMAND(ScissorTest)
5288 INTERPCOMMAND(Scissor)
5289 INTERPCOMMAND(BlendFunc)
5290 INTERPCOMMAND(BlendSubtract)
5291 INTERPCOMMAND(DepthMask)
5292 INTERPCOMMAND(DepthFunc)
5293 INTERPCOMMAND(DepthRange)
5294 INTERPCOMMAND(PolygonOffset)
5295 INTERPCOMMAND(CullFace)
5296 INTERPCOMMAND(AlphaTest)
5297 INTERPCOMMAND(AlphaFunc)
5298 INTERPCOMMAND(SetTexture)
5299 INTERPCOMMAND(SetShader)
5300 INTERPCOMMAND(Uniform4f)
5301 INTERPCOMMAND(UniformMatrix4f)
5302 INTERPCOMMAND(Uniform1i)
5303 INTERPCOMMAND(SetRenderTargets)
5304 INTERPCOMMAND(ClipPlane)
// Draw is variable-sized, so it is handled by hand; the offset is published right
// after each draw rather than only at the end of the batch
5306 case DPSOFTRAST_OPCODE_Draw:
5307 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5308 commandoffset += command->commandsize;
5309 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5311 thread->commandoffset = commandoffset;
// NOTE(review): the Reset case body is not visible here (presumably rewinds commandoffset to 0)
5314 case DPSOFTRAST_OPCODE_Reset:
5319 thread->commandoffset = commandoffset;
// DPSOFTRAST_Draw_Thread: worker thread entry point (passed to Thread_CreateThread).
//   data - the DPSOFTRAST_State_Thread this worker services
// Runs while thread->index >= 0. Each pass interprets any commands between the
// thread's offset and dpsoftrast.drawcommand; then, holding drawmutex, if it is
// still caught up and still alive it signals a waiter (if one is registered),
// marks itself starving and sleeps on drawcond until signalled.
// NOTE(review): the return statement is not visible here.
5322 static int DPSOFTRAST_Draw_Thread(void *data)
5324 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5325 while(thread->index >= 0)
5327 if (thread->commandoffset != dpsoftrast.drawcommand)
5329 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5333 Thread_LockMutex(thread->drawmutex);
// re-check under the mutex before sleeping
5334 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// tell a thread blocked in the flush path that this worker has caught up
5336 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5337 thread->starving = true;
5338 Thread_CondWait(thread->drawcond, thread->drawmutex);
5339 thread->starving = false;
5341 Thread_UnlockMutex(thread->drawmutex);
// DPSOFTRAST_Draw_FlushThreads: makes every thread state catch up with
// dpsoftrast.drawcommand, then resets the command pool's usedcommands counter.
// Threaded: first wakes starving workers that are behind, then blocks on each
// worker's waitcond until it has caught up.
// The final loop runs the commands directly on the calling thread for each
// thread state that is behind.
// NOTE(review): the branch line in front of that final loop is not visible;
// presumably it is the non-threaded `else` path.
5347 static void DPSOFTRAST_Draw_FlushThreads(void)
5349 DPSOFTRAST_State_Thread *thread;
5351 DPSOFTRAST_Draw_SyncCommands();
5352 if (dpsoftrast.usethreads)
// pass 1: wake workers that are asleep with work pending
5354 for (i = 0; i < dpsoftrast.numthreads; i++)
5356 thread = &dpsoftrast.threads[i];
// unlocked pre-check, repeated under the mutex before signalling
5357 if (thread->commandoffset != dpsoftrast.drawcommand)
5359 Thread_LockMutex(thread->drawmutex);
5360 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5361 Thread_CondSignal(thread->drawcond);
5362 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker that is still behind
5365 for (i = 0; i < dpsoftrast.numthreads; i++)
5367 thread = &dpsoftrast.threads[i];
5368 if (thread->commandoffset != dpsoftrast.drawcommand)
5370 Thread_LockMutex(thread->drawmutex);
5371 if (thread->commandoffset != dpsoftrast.drawcommand)
// the worker signals waitcond when it sees waiting set and has caught up
5373 thread->waiting = true;
5374 Thread_CondWait(thread->waitcond, thread->drawmutex);
5375 thread->waiting = false;
5377 Thread_UnlockMutex(thread->drawmutex);
5383 for (i = 0; i < dpsoftrast.numthreads; i++)
5385 thread = &dpsoftrast.threads[i];
5386 if (thread->commandoffset != dpsoftrast.drawcommand)
5387 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5390 dpsoftrast.commandpool.usedcommands = 0;
// DPSOFTRAST_Flush: public entry point; executes all queued commands by
// delegating to DPSOFTRAST_Draw_FlushThreads.
5393 void DPSOFTRAST_Flush(void)
5395 DPSOFTRAST_Draw_FlushThreads();
// DPSOFTRAST_Finish: public entry point taking no arguments.
// NOTE(review): its body is not visible in this listing.
5398 void DPSOFTRAST_Finish(void)
// DPSOFTRAST_Init: resets the global rasterizer state and creates the thread states.
//   width, height - framebuffer dimensions
//   numthreads    - requested worker count; threads are used only when this is > 0
//                   and Thread_HasThreads() reports support, and the count is then
//                   clamped to 1..64 (a single thread state is used otherwise)
//   interlace     - clamped to 0..1; forced to 0 when threads are not used
//   colorpixels   - becomes color buffer 0; color buffers 1..3 start out unbound
//   depthpixels   - depth buffer
// Each thread state receives the default render state set below and is fully
// validated; worker threads with their condition variables and mutex are created
// only when threading is enabled.
// NOTE(review): the declaration of `u`, the loop counter and the return statement
// fall on lines that are not visible in this listing.
5403 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5413 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5414 dpsoftrast.bigendian = u.b[3];
5415 dpsoftrast.fb_width = width;
5416 dpsoftrast.fb_height = height;
5417 dpsoftrast.fb_depthpixels = depthpixels;
5418 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only color buffer 0 is bound at init. Index 1 used to be written three times
// here, so 2 and 3 were never cleared explicitly; they are already zero-filled
// by the memset above, so this only makes the intent explicit.
5419 dpsoftrast.fb_colorpixels[1] = NULL;
5420 dpsoftrast.fb_colorpixels[2] = NULL;
5421 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5422 dpsoftrast.viewport[0] = 0;
5423 dpsoftrast.viewport[1] = 0;
5424 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5425 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5426 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at 1 with no storage allocated yet
// (presumably index 0 is reserved to mean "no texture" -- TODO confirm)
5427 dpsoftrast.texture_firstfree = 1;
5428 dpsoftrast.texture_end = 1;
5429 dpsoftrast.texture_max = 0;
5430 dpsoftrast.color[0] = 1;
5431 dpsoftrast.color[1] = 1;
5432 dpsoftrast.color[2] = 1;
5433 dpsoftrast.color[3] = 1;
5434 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5435 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5436 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5437 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5438 for (i = 0; i < dpsoftrast.numthreads; i++)
5440 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default per-thread render state
5442 thread->cullface = GL_BACK;
5443 thread->colormask[0] = 1;
5444 thread->colormask[1] = 1;
5445 thread->colormask[2] = 1;
5446 thread->colormask[3] = 1;
5447 thread->blendfunc[0] = GL_ONE;
5448 thread->blendfunc[1] = GL_ZERO;
5449 thread->depthmask = true;
5450 thread->depthtest = true;
5451 thread->depthfunc = GL_LEQUAL;
5452 thread->scissortest = false;
5453 thread->alphatest = false;
5454 thread->alphafunc = GL_GREATER;
5455 thread->alphavalue = 0.5f;
5456 thread->viewport[0] = 0;
5457 thread->viewport[1] = 0;
5458 thread->viewport[2] = dpsoftrast.fb_width;
5459 thread->viewport[3] = dpsoftrast.fb_height;
5460 thread->scissor[0] = 0;
5461 thread->scissor[1] = 0;
5462 thread->scissor[2] = dpsoftrast.fb_width;
5463 thread->scissor[3] = dpsoftrast.fb_height;
5464 thread->depthrange[0] = 0;
5465 thread->depthrange[1] = 1;
5466 thread->polygonoffset[0] = 0;
5467 thread->polygonoffset[1] = 0;
5468 thread->clipplane[0] = 0;
5469 thread->clipplane[1] = 0;
5470 thread->clipplane[2] = 0;
5471 thread->clipplane[3] = 1;
// empty work queues; the thread starts at the beginning of the command pool
5473 thread->numspans = 0;
5474 thread->numtriangles = 0;
5475 thread->commandoffset = 0;
5476 thread->waiting = false;
5477 thread->starving = false;
// mark every state group dirty and validate everything once
5479 thread->validate = -1;
5480 DPSOFTRAST_Validate(thread, -1);
5482 if (dpsoftrast.usethreads)
5484 thread->waitcond = Thread_CreateCond();
5485 thread->drawcond = Thread_CreateCond();
5486 thread->drawmutex = Thread_CreateMutex();
5487 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5493 void DPSOFTRAST_Shutdown(void)
5496 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5498 DPSOFTRAST_State_Thread *thread;
5499 for (i = 0; i < dpsoftrast.numthreads; i++)
5501 thread = &dpsoftrast.threads[i];
5502 Thread_LockMutex(thread->drawmutex);
5504 Thread_CondSignal(thread->drawcond);
5505 Thread_UnlockMutex(thread->drawmutex);
5506 Thread_WaitThread(thread->thread, 0);
5507 Thread_DestroyCond(thread->waitcond);
5508 Thread_DestroyCond(thread->drawcond);
5509 Thread_DestroyMutex(thread->drawmutex);
5512 for (i = 0;i < dpsoftrast.texture_end;i++)
5513 if (dpsoftrast.texture[i].bytes)
5514 MM_FREE(dpsoftrast.texture[i].bytes);
5515 if (dpsoftrast.texture)
5516 free(dpsoftrast.texture);
5517 if (dpsoftrast.threads)
5518 MM_FREE(dpsoftrast.threads);
5519 memset(&dpsoftrast, 0, sizeof(dpsoftrast));