3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
// Make bool available as an alias of the engine's qboolean type.
14 typedef qboolean bool;
// Alignment (in bytes) passed to _mm_malloc by MM_MALLOC/MM_CALLOC.
18 #define ATOMIC_SIZE 32
// Per-toolchain definitions of the alignment and atomic helpers:
//   ALIGN(var)       - declares var with 16-byte alignment
//   ATOMIC(var)      - declares var with 32-byte alignment
//   MEMORY_BARRIER   - expands to _mm_sfence()
//   ATOMIC_COUNTER   - integer type used for shared counters
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - atomic updates of a counter
// NOTE(review): the #else/#endif lines that pair these conditionals are not
// visible in this listing (the embedded numbering has gaps).
// Apple: OSAtomic*Barrier primitives.
21 #if defined(__APPLE__)
22 #include <libkern/OSAtomic.h>
23 #define ALIGN(var) var __attribute__((__aligned__(16)))
24 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25 #define MEMORY_BARRIER (_mm_sfence())
26 #define ATOMIC_COUNTER volatile int32_t
27 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
// GCC-compatible compilers: __sync builtins.
30 #elif defined(__GNUC__)
31 #define ALIGN(var) var __attribute__((__aligned__(16)))
32 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33 #define MEMORY_BARRIER (_mm_sfence())
34 //(__sync_synchronize())
35 #define ATOMIC_COUNTER volatile int
36 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
37 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
38 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
// MSVC: __declspec(align) and the Interlocked* functions.
39 #elif defined(_MSC_VER)
40 #define ALIGN(var) __declspec(align(16)) var
41 #define ATOMIC(var) __declspec(align(32)) var
42 #define MEMORY_BARRIER (_mm_sfence())
44 #define ATOMIC_COUNTER volatile LONG
45 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
46 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
47 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallbacks: no alignment, no barrier, and plain (non-atomic) integer
// operations on the counter.
52 #define ALIGN(var) var
55 #define ATOMIC(var) var
57 #ifndef MEMORY_BARRIER
58 #define MEMORY_BARRIER ((void)0)
60 #ifndef ATOMIC_COUNTER
61 #define ATOMIC_COUNTER int
63 #ifndef ATOMIC_INCREMENT
64 #define ATOMIC_INCREMENT(counter) (++(counter))
66 #ifndef ATOMIC_DECREMENT
67 #define ATOMIC_DECREMENT(counter) (--(counter))
70 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
// Release memory obtained through the _mm_malloc-based MM_MALLOC/MM_CALLOC.
85 #define MM_FREE _mm_free
// Allocator mapping onto the plain C library.
// NOTE(review): presumably the alternative (non-SSE) branch; the #else line
// and a matching MM_FREE definition for this branch are not visible in this
// listing — confirm against the full file.
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays: position, color and eight
// texture coordinate sets. DPSOFTRAST_ARRAY_TOTAL is the number of arrays
// and is used to size attribute tables.
// NOTE(review): the braces and the typedef name are not visible in this listing.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One slot of the texture table (dpsoftrast.texture[]).
// NOTE(review): several members used elsewhere in this file (flags, width,
// height, depth, sides, mipmaps, size) are not visible in this listing.
108 typedef struct DPSOFTRAST_Texture_s
// filter mode, stored by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// NOTE(review): presumably counts outstanding binds by queued draw state — confirm
118 ATOMIC_COUNTER binds;
// pixel storage for all mip levels; NULL marks an unused slot
119 unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes,
// [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command sizes are rounded up to a multiple of COMMAND_SIZE (see
// DPSOFTRAST_ALIGNCOMMAND) and command structs are declared with ALIGN().
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Common header shared by every queued command: an opcode and the size of
// the command.
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 (Reset) is written by DPSOFTRAST_AllocateCommand when a command
// does not fit in the space left before the end of the pool.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares DPSOFTRAST_OPCODE_<name> and
// an aligned struct DPSOFTRAST_Command_<name> that starts with the common
// header members.
// NOTE(review): the macro lines carrying the braces and the `fields`
// expansion are not visible in this listing.
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Size in bytes of the command buffer that commands are allocated from.
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound on a single command's size — it
// is not used in the visible code; confirm.
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Command pool: the byte buffer DPSOFTRAST_AllocateCommand carves commands
// out of.
// NOTE(review): the freecommand and usedcommands members referenced
// elsewhere in this file are not visible in this listing.
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle data read by the DPSOFTRAST_CALCATTRIB macros.
// NOTE(review): other members may exist; the listing has gaps here.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// for each attribute array: [0] slope along x, [1] slope along y,
// [2] base value (this is how DPSOFTRAST_CALCATTRIB combines them)
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB evaluates attribute `arrayindex` of a triangle at a
// span's (x, y) using SSE:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB4F computes the same four components one by one into
// plain float arrays.
// NOTE(review): CALCATTRIB4F uses `span->x` / `span->y` with the parentheses
// around the member access rather than around the `span` argument.
// NOTE(review): the closing-brace lines of both macros are not visible in
// this listing.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// NOTE(review): presumably the maximum sub-span length used by the span
// renderers — it is not used in the visible code; confirm.
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// One horizontal run of framebuffer pixels generated from a triangle.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// Capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread.
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate; DPSOFTRAST_Validate recomputes the
// derived state that belongs to each bit and then clears it.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// All derived state in one mask.
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// the blend factor pair (and the subtract flag) onto one of these.
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
// Per-thread copy of the render state; the DPSOFTRAST_Interpret_* functions
// store command parameters into it.
// NOTE(review): many members referenced elsewhere in this file (index,
// viewport, scissor, validate, miny1..maxy2, drawmutex, drawcond, waitcond,
// ...) are not visible in this listing.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// set by DPSOFTRAST_Interpret_PolygonOffset: [0] alongnormal, [1] intoview
237 float polygonoffset[2];
240 int shader_permutation;
241 int shader_exactspecularmath;
// textures bound per texture unit; rebased by DPSOFTRAST_Texture_Grow when
// the texture table is reallocated
243 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248 // DPSOFTRAST_VALIDATE_ flags
251 // derived values (DPSOFTRAST_VALIDATE_FB)
// viewport transform filled in by DPSOFTRAST_RecalcViewport
254 ALIGN(float fb_viewportcenter[4]);
255 ALIGN(float fb_viewportscale[4]);
257 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
260 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it
// against the pool's freecommand and dpsoftrast.drawcommand
269 ATOMIC(volatile int commandoffset);
// flags used in the condition-variable handshake in
// DPSOFTRAST_Draw_FreeCommandPool
271 volatile bool waiting;
272 volatile bool starving;
// NOTE(review): presumably filled while rasterizing; not used in the visible code
279 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 DPSOFTRAST_State_Thread);
// Global rasterizer state; the single instance is `dpsoftrast` below.
// NOTE(review): many members referenced elsewhere in this file (fb_width,
// fb_height, viewport, numthreads, usethreads, interlace, texture_max,
// texture_end, ...) and the closing line of the typedef are not visible in
// this listing.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
// framebuffer storage: one depth buffer and up to four color buffers
288 unsigned int *fb_depthpixels;
289 unsigned int *fb_colorpixels[4];
// viewport transform filled in by DPSOFTRAST_RecalcViewport
292 ALIGN(float fb_viewportcenter[4]);
293 ALIGN(float fb_viewportscale[4]);
296 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// NOTE(review): presumably the client-side vertex array pointers — they are
// not used in the visible code; confirm.
299 const float *pointer_vertex3f;
300 const float *pointer_color4f;
301 const unsigned char *pointer_color4ub;
302 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
305 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// textures bound per texture unit; rebased by DPSOFTRAST_Texture_Grow
307 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
311 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312 float *screencoord4f;
318 int shader_permutation;
319 int shader_exactspecularmath;
// texture table: `texture` is the array, texture_firstfree is where
// DPSOFTRAST_Texture_New starts searching for an unused slot
323 int texture_firstfree;
324 DPSOFTRAST_Texture *texture;
// message describing the most recent error, set by the API functions
329 const char *errorstring;
// per-thread state array, indexed 0..numthreads-1
334 DPSOFTRAST_State_Thread *threads;
// command pool offset published by DPSOFTRAST_Draw_SyncCommands
336 ATOMIC(volatile int drawcommand);
338 DPSOFTRAST_State_Command_Pool commandpool;
342 DPSOFTRAST_State dpsoftrast;
// Scale applied when converting a floating point depth to a 32-bit integer depth.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float color components (nominally 0..1) into a BGRA8 word with
// rounding: a in bits 24-31, r in 16-23, g in 8-15, b in 0-7.
// Every argument is parenthesized so that expression arguments such as
// `x + y` are scaled as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Convert a float depth d to the integer depth format: DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
// Maximum span length in pixels.
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
352 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354 fb_viewportcenter[3] = 0.5f;
355 fb_viewportcenter[0] = 0.0f;
356 fb_viewportscale[1] = 0.5f * viewport[2];
357 fb_viewportscale[2] = -0.5f * viewport[3];
358 fb_viewportscale[3] = 0.5f;
359 fb_viewportscale[0] = 1.0f;
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
364 if (dpsoftrast.interlace)
366 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
373 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381 // and viewport projection values
384 x1 = thread->scissor[0];
385 x2 = thread->scissor[0] + thread->scissor[2];
386 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387 y2 = dpsoftrast.fb_height - thread->scissor[1];
388 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
390 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393 thread->fb_scissor[0] = x1;
394 thread->fb_scissor[1] = y1;
395 thread->fb_scissor[2] = x2 - x1;
396 thread->fb_scissor[3] = y2 - y1;
398 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399 DPSOFTRAST_RecalcThread(thread);
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
409 if (thread->blendsubtract)
411 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
413 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
421 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
428 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
438 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
442 mask &= thread->validate;
445 if (mask & DPSOFTRAST_VALIDATE_FB)
447 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448 DPSOFTRAST_RecalcFB(thread);
450 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453 DPSOFTRAST_RecalcDepthFunc(thread);
455 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458 DPSOFTRAST_RecalcBlendFunc(thread);
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465 return &dpsoftrast.texture[index];
// Enlarge the texture table and rebase every bound-texture pointer (global
// and per thread) onto the reallocated array.
// NOTE(review): the embedded numbering skips lines 473-475 and 479 here
// (local declarations and presumably an `else` before the doubling);
// confirm against the full file.
469 static void DPSOFTRAST_Texture_Grow(void)
// remember the old base address so bound pointers can be rebased below
471 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472 DPSOFTRAST_State_Thread *thread;
476 // expand texture array as needed
477 if (dpsoftrast.texture_max < 1024)
478 dpsoftrast.texture_max = 1024;
480 dpsoftrast.texture_max *= 2;
// NOTE(review): the result of realloc is not checked and overwrites the only
// copy of the pointer; on failure the old array would be leaked and
// dpsoftrast.texture would be NULL.
481 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// rebase the global bindings: keep the same element offset in the new array
482 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483 if (dpsoftrast.texbound[i])
484 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
// rebase each thread's bindings the same way
485 for (j = 0; j < dpsoftrast.numthreads; j++)
487 thread = &dpsoftrast.threads[j];
488 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489 if (thread->texbound[i])
490 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Validate the requested flags and dimensions, claim a slot in the texture
// table, lay out its mip levels and allocate zeroed pixel storage.
// Each failed check records a message in dpsoftrast.errorstring.
// NOTE(review): the return statements, local declarations, the format
// switch header and the mip-level loop header are not visible in this
// listing (the embedded numbering has gaps); presumably the function returns
// the new texture index, or 0 on error — confirm against the full file.
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six sides per mip level, everything else one
503 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505 DPSOFTRAST_Texture *texture;
// NOTE(review): the product of three ints can overflow before this comparison
506 if (width*height*depth < 1)
508 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
511 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// per-format restrictions
518 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
522 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
533 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// flag combinations that are rejected
540 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
// NOTE(review): this check and its message are repeated verbatim right below
545 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
550 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// power-of-two test: x & (x-1) is zero only when at most one bit of x is set
560 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
565 // find first empty slot in texture array
566 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567 if (!dpsoftrast.texture[texnum].bytes)
// the next search can start right after the slot being claimed
569 dpsoftrast.texture_firstfree = texnum + 1;
// make room in the table and extend the used range to include the slot
570 if (dpsoftrast.texture_max <= texnum)
571 DPSOFTRAST_Texture_Grow();
572 if (dpsoftrast.texture_end <= texnum)
573 dpsoftrast.texture_end = texnum + 1;
574 texture = &dpsoftrast.texture[texnum];
575 memset(texture, 0, sizeof(*texture));
576 texture->flags = flags;
577 texture->width = width;
578 texture->height = height;
579 texture->depth = depth;
580 texture->sides = sides;
// record one mip level: [0] byte offset, [1] size in bytes (4 bytes per
// texel, all sides), [2] width, [3] height, [4] depth
592 s = w * h * d * sides * 4;
593 texture->mipmap[mipmaps][0] = size;
594 texture->mipmap[mipmaps][1] = s;
595 texture->mipmap[mipmaps][2] = w;
596 texture->mipmap[mipmaps][3] = h;
597 texture->mipmap[mipmaps][4] = d;
// the level just recorded is the last one when it is a single texel or when
// mipmaps were not requested
600 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
606 texture->mipmaps = mipmaps;
607 texture->size = size;
609 // allocate the pixels now
// NOTE(review): the result of MM_CALLOC is not checked in the visible code;
// a NULL result would leave the slot looking unused
610 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Release a texture's pixel storage, clear its slot and update the table's
// free/used bookkeeping. Does nothing for an invalid index.
// NOTE(review): the embedded numbering skips lines 618-620 between the
// lookup and the MM_FREE call; confirm against the full file.
614 void DPSOFTRAST_Texture_Free(int index)
616 DPSOFTRAST_Texture *texture;
617 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
621 MM_FREE(texture->bytes);
622 texture->bytes = NULL;
// zeroing the slot also leaves bytes == NULL, which marks it as unused
623 memset(texture, 0, sizeof(*texture));
624 // adjust the free range and used range
625 if (dpsoftrast.texture_firstfree > index)
626 dpsoftrast.texture_firstfree = index;
// shrink the used range past any trailing unused slots
627 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628 dpsoftrast.texture_end--;
// Rebuild mip levels 1..mipmaps-1 of a texture, each from the level above
// it, by averaging neighbouring 4-byte texels with rounding.
// Does nothing for an invalid index.
// NOTE(review): several lines are not visible in this listing (the early
// return for textures with a single level, the layer0/layer1 and row0/row1
// assignments, the branch headers selecting between the 3D and 2D paths,
// and all braces); the comments below describe only what is visible.
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 int i, x, y, z, w, layer0, layer1, row0, row1;
633 unsigned char *o, *i0, *i1, *i2, *i3;
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636 if (texture->mipmaps <= 1)
// level i is produced from level i-1
638 for (i = 1;i < texture->mipmaps;i++)
640 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent level's depth
644 if (layer1 >= texture->mipmap[i-1][4])
645 layer1 = texture->mipmap[i-1][4]-1;
646 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent level's height
650 if (row1 >= texture->mipmap[i-1][3])
651 row1 = texture->mipmap[i-1][3]-1;
// o: output row in level i; i0..i3: the source rows in level i-1 for the
// combinations of (layer0, layer1) x (row0, row1)
652 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
653 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657 w = texture->mipmap[i][2];
660 if (texture->mipmap[i-1][2] > 1)
662 // average 3D texture
// eight source texels per output texel: (sum + 4) >> 3
663 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
673 // average 3D mipmap with parent width == 1
// four source texels per output texel: (sum + 2) >> 2
// NOTE(review): this loop advances i0 and i1 but not i2 and i3
674 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
685 if (texture->mipmap[i-1][2] > 1)
687 // average 2D texture (common case)
// 2x2 box filter: (sum + 2) >> 2
688 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
698 // 2D texture with parent width == 1
// two source texels per output texel: (sum + 1) >> 1
699 o[0] = (i0[0] + i1[0] + 1) >> 1;
700 o[1] = (i0[1] + i1[1] + 1) >> 1;
701 o[2] = (i0[2] + i1[2] + 1) >> 1;
702 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copy a blockwidth x blockheight block of 4-byte texels from `pixels`
// (tightly packed rows) into the texture at (blockx, blocky), then rebuild
// the mipmaps. Does nothing for an invalid index.
// NOTE(review): the visible code addresses the destination with the level 0
// row width (mipmap[0][2]) and no level offset, and does not use `mip`; the
// embedded numbering skips lines 712, 714-717 and 724-726 (declarations,
// presumably validation, and the row counter update) — confirm against the
// full file.
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 DPSOFTRAST_Texture *texture;
713 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
// first destination texel of the block
718 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719 while (blockheight > 0)
// copy one row, then step the source by the block width and the destination
// by the texture width
721 memcpy(dst, pixels, blockwidth * 4);
722 pixels += blockwidth * 4;
723 dst += texture->mipmap[0][2] * 4;
727 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replace all of mip level 0 with `pixels` (mipmap[0][1] bytes are copied)
// and rebuild the mipmaps. Does nothing for an invalid index.
// NOTE(review): the embedded numbering skips lines 733-735 between the
// lookup and the copy; confirm against the full file.
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737 DPSOFTRAST_Texture_CalculateMipmaps(index);
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 DPSOFTRAST_Texture *texture;
742 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743 return texture->mipmap[mip][2];
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 DPSOFTRAST_Texture *texture;
748 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749 return texture->mipmap[mip][3];
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755 return texture->mipmap[mip][4];
// Return a pointer to the first byte of mip level `mip` inside the texture's
// pixel storage, or 0 (null) for an invalid index.
// NOTE(review): `mip` is used to index mipmap[] without a range check; the
// embedded numbering skips lines 761-762 before the return — confirm against
// the full file.
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
763 return texture->bytes + texture->mipmap[mip][0];
// Set the filter mode of a texture. Does nothing for an invalid index.
// Filter modes above DPSOFTRAST_TEXTURE_FILTER_LINEAR on a texture created
// without DPSOFTRAST_TEXTURE_FLAG_MIPMAP record an error message.
// NOTE(review): the embedded numbering skips lines 772-775 after the error
// message (presumably a return that leaves the filter unchanged); confirm
// against the full file.
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 DPSOFTRAST_Texture *texture;
768 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
776 texture->filter = filter;
779 static void DPSOFTRAST_Draw_FlushThreads(void);
781 static void DPSOFTRAST_Draw_SyncCommands(void)
783 if(dpsoftrast.usethreads) MEMORY_BARRIER;
784 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Try to make `space` bytes available in the command pool: publish the
// queued commands, recompute how much of the pool is still in use from the
// threads' command offsets, and wait on a thread when that is not enough.
// NOTE(review): the loop structure, local declarations (i, waitindex,
// commandoffset), the assignments to waitindex and the exits taken when the
// size checks succeed are not visible in this listing (the embedded
// numbering has gaps); the comments below describe only what is visible.
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 DPSOFTRAST_State_Thread *thread;
791 int freecommand = dpsoftrast.commandpool.freecommand;
792 int usedcommands = dpsoftrast.commandpool.usedcommands;
// check whether the pool already has enough room
793 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
795 DPSOFTRAST_Draw_SyncCommands();
801 for (i = 0; i < dpsoftrast.numthreads; i++)
803 thread = &dpsoftrast.threads[i];
// distance from this thread's offset to the write offset, wrapped into the
// pool size
804 commandoffset = freecommand - thread->commandoffset;
805 if (commandoffset < 0)
806 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
// keep the largest distance seen
807 if (commandoffset > usedcommands)
810 usedcommands = commandoffset;
813 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
// wait on the selected thread's condition variable, signalling it first if
// it reports starving
815 thread = &dpsoftrast.threads[waitindex];
816 Thread_LockMutex(thread->drawmutex);
817 if (thread->commandoffset != dpsoftrast.drawcommand)
819 thread->waiting = true;
820 if (thread->starving) Thread_CondSignal(thread->drawcond);
821 Thread_CondWait(thread->waitcond, thread->drawmutex);
822 thread->waiting = false;
824 Thread_UnlockMutex(thread->drawmutex);
// store the recomputed usage
826 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of
// COMMAND_SIZE; the mask arithmetic assumes COMMAND_SIZE is a power of two.
// DPSOFTRAST_ALLOCATECOMMAND(name) allocates a command of type
// DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name> and the
// aligned size of that struct, and yields a pointer of the matching type.
829 #define DPSOFTRAST_ALIGNCOMMAND(size) \
830 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
831 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
832 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserve `size` bytes in the command pool for a command with the given
// opcode and fill in its header (opcode and commandsize).
// NOTE(review): the return statement, the updates of freecommand (advance
// and wrap to 0), an `else` and all braces are not visible in this listing
// (the embedded numbering has gaps); presumably returns the command pointer
// — confirm against the full file.
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 DPSOFTRAST_Command *command;
837 int freecommand = dpsoftrast.commandpool.freecommand;
838 int usedcommands = dpsoftrast.commandpool.usedcommands;
// room is always kept for one extra command header
839 int extra = sizeof(DPSOFTRAST_Command);
// if the command does not fit before the end of the pool, the remaining tail
// must be counted as needed space too
840 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// not enough free space: reclaim it from the threads, or flush
842 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844 if (dpsoftrast.usethreads)
845 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847 DPSOFTRAST_Draw_FlushThreads();
// reload the offsets, which the calls above may have changed
848 freecommand = dpsoftrast.commandpool.freecommand;
849 usedcommands = dpsoftrast.commandpool.usedcommands;
// command does not fit in the tail: write a Reset command there and account
// for the skipped tail
851 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854 command->opcode = DPSOFTRAST_OPCODE_Reset;
855 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// place the new command and fill in its header
858 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859 command->opcode = opcode;
860 command->commandsize = size;
862 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
// store the updated write offset and usage
864 dpsoftrast.commandpool.freecommand = freecommand;
865 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Give back `size` bytes of command pool usage and store the adjusted write
// offset.
// NOTE(review): the embedded numbering skips lines 873-874 before the
// wrap-around adjustment (presumably the subtraction of size from
// freecommand and the test for a negative result); confirm against the full
// file.
869 static void DPSOFTRAST_UndoCommand(int size)
871 int freecommand = dpsoftrast.commandpool.freecommand;
872 int usedcommands = dpsoftrast.commandpool.usedcommands;
875 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876 usedcommands -= size;
877 dpsoftrast.commandpool.freecommand = freecommand;
878 dpsoftrast.commandpool.usedcommands = usedcommands;
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 thread->viewport[0] = command->x;
885 thread->viewport[1] = command->y;
886 thread->viewport[2] = command->width;
887 thread->viewport[3] = command->height;
888 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Viewport command and mirror the new viewport in the global state,
// recomputing the global viewport transform.
// NOTE(review): the stores to command->x and command->y are not visible in
// this listing (embedded lines 893-894 are skipped); confirm against the
// full file.
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
895 command->width = width;
896 command->height = height;
// keep the global copy in sync with what was queued
898 dpsoftrast.viewport[0] = x;
899 dpsoftrast.viewport[1] = y;
900 dpsoftrast.viewport[2] = width;
901 dpsoftrast.viewport[3] = height;
902 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// ClearColor command: the clear color as four floats.
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Thread side of the ClearColor command: fill this thread's rows of the
// scissor rectangle in every allocated color buffer with the packed color.
// NOTE(review): declarations (p, c, bandy), the early exits, the inner
// row loop and the pixel store are not visible in this listing (the embedded
// numbering has gaps); the comments below describe only what is visible.
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 int i, x1, y1, x2, y2, w, h, x, y;
909 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
913 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914 miny1 = thread->miny1;
915 maxy1 = thread->maxy1;
916 miny2 = thread->miny2;
917 maxy2 = thread->maxy2;
// scissor rectangle as [x1, x2) x [y1, y2)
918 x1 = thread->fb_scissor[0];
919 y1 = thread->fb_scissor[1];
920 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's bands
922 if (y1 < miny1) y1 = miny1;
923 if (y2 > maxy2) y2 = maxy2;
928 // FIXME: honor fb_colormask?
929 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
// up to four color buffers; unallocated ones are tested for here
930 for (i = 0;i < 4;i++)
932 if (!dpsoftrast.fb_colorpixels[i])
// walk the first band, then continue from the start of the second band
934 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
937 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938 for (x = x1;x < x2;x++)
// Queue a ClearColor command.
// NOTE(review): the stores of r, g, b and a into the command are not visible
// in this listing (embedded lines 946-949 are skipped); confirm against the
// full file.
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// ClearDepth command: the clear depth as a float.
952 DEFCOMMAND(3, ClearDepth, float depth;)
// Thread side of the ClearDepth command: fill this thread's rows of the
// scissor rectangle in the depth buffer with the converted depth value.
// NOTE(review): declarations (p, c, bandy), the early exits, the inner row
// loop and the pixel store are not visible in this listing (the embedded
// numbering has gaps); the comments below describe only what is visible.
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 int x1, y1, x2, y2, w, h, x, y;
956 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the row bands are up to date
960 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961 miny1 = thread->miny1;
962 maxy1 = thread->maxy1;
963 miny2 = thread->miny2;
964 maxy2 = thread->maxy2;
// scissor rectangle as [x1, x2) x [y1, y2)
965 x1 = thread->fb_scissor[0];
966 y1 = thread->fb_scissor[1];
967 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rows to the span covered by this thread's bands
969 if (y1 < miny1) y1 = miny1;
970 if (y2 > maxy2) y2 = maxy2;
975 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// walk the first band, then continue from the start of the second band
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// Queue a ClearDepth command.
// NOTE(review): the store of d into the command is not visible in this
// listing (embedded line 987 is skipped); confirm against the full file.
984 void DPSOFTRAST_ClearDepth(float d)
986 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 thread->colormask[0] = command->r != 0;
994 thread->colormask[1] = command->g != 0;
995 thread->colormask[2] = command->b != 0;
996 thread->colormask[3] = command->a != 0;
997 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queue a ColorMask command.
// NOTE(review): the stores of r, g, b and a into the command are not visible
// in this listing (embedded lines 1002-1005 are skipped); confirm against
// the full file.
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 thread->depthtest = command->enable;
1012 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1014 void DPSOFTRAST_DepthTest(int enable)
1016 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017 command->enable = enable;
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 thread->scissortest = command->enable;
1024 thread->validate |= DPSOFTRAST_VALIDATE_FB;
1026 void DPSOFTRAST_ScissorTest(int enable)
1028 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029 command->enable = enable;
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 thread->scissor[0] = command->x;
1036 thread->scissor[1] = command->y;
1037 thread->scissor[2] = command->width;
1038 thread->scissor[3] = command->height;
1039 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queue a Scissor command.
// NOTE(review): the stores to command->x and command->y are not visible in
// this listing (embedded lines 1044-1045 are skipped); confirm against the
// full file.
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1046 command->width = width;
1047 command->height = height;
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 thread->blendfunc[0] = command->sfactor;
1054 thread->blendfunc[1] = command->dfactor;
1055 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060 command->sfactor = sfactor;
1061 command->dfactor = dfactor;
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 thread->blendsubtract = command->enable;
1068 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1070 void DPSOFTRAST_BlendSubtract(int enable)
1072 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073 command->enable = enable;
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 thread->depthmask = command->enable;
1081 void DPSOFTRAST_DepthMask(int enable)
1083 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084 command->enable = enable;
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 thread->depthfunc = command->func;
1092 void DPSOFTRAST_DepthFunc(int func)
1094 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095 command->func = func;
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 thread->depthrange[0] = command->nearval;
1102 thread->depthrange[1] = command->farval;
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107 command->nearval = nearval;
1108 command->farval = farval;
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 thread->polygonoffset[0] = command->alongnormal;
1115 thread->polygonoffset[1] = command->intoview;
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120 command->alongnormal = alongnormal;
1121 command->intoview = intoview;
1124 DEFCOMMAND(14, CullFace, int mode;)
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 thread->cullface = command->mode;
1129 void DPSOFTRAST_CullFace(int mode)
1131 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132 command->mode = mode;
1135 DEFCOMMAND(15, AlphaTest, int enable;)
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 thread->alphatest = command->enable;
1140 void DPSOFTRAST_AlphaTest(int enable)
1142 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143 command->enable = enable;
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 thread->alphafunc = command->func;
1150 thread->alphavalue = command->ref;
// Allocates an AlphaFunc command and records the comparison function in it.
// NOTE(review): the store of `ref` is not visible in this listing (embedded
// line 1156 is skipped); presumably command->ref = ref - confirm.
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155 command->func = func;
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 dpsoftrast.color[0] = r;
1162 dpsoftrast.color[1] = g;
1163 dpsoftrast.color[2] = b;
1164 dpsoftrast.color[3] = a;
// Reads a rectangular block of the color framebuffer (fb_colorpixels[0]) into
// outpixels. Both buffers are addressed at 4 bytes per pixel; output rows are
// packed at blockwidth*4 bytes, input rows at fb_width*4 bytes.
// The block is clipped to the framebuffer, and input rows are addressed as
// (fb_height - 1 - y), so the framebuffer is read bottom-up relative to y.
// NOTE(review): several declarations (bx1, by1, x, y, b, o) and the per-pixel /
// per-row copy statements are not visible in this listing.
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// row pitches in bytes (4 bytes per pixel)
1169 int outstride = blockwidth * 4;
1170 int instride = dpsoftrast.fb_width * 4;
// exclusive right/bottom edges of the requested block
1173 int bx2 = blockx + blockwidth;
1174 int by2 = blocky + blockheight;
1178 unsigned char *inpixels;
// clip the block to the framebuffer rectangle
1182 if (bx1 < 0) bx1 = 0;
1183 if (by1 < 0) by1 = 0;
1184 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// big-endian hosts take a per-pixel path (inner x loop below); the other path
// handles a whole row at a time
1188 if (dpsoftrast.bigendian)
1190 for (y = by1;y < by2;y++)
// b: start of the clipped source row; o: start of the matching output row.
// NOTE(review): o is not advanced by (bx1 - blockx) when the left edge was
// clipped - confirm whether callers ever pass a negative blockx.
1192 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193 o = (unsigned char *)outpixels + (y - by1) * outstride;
1194 for (x = bx1;x < bx2;x++)
1207 for (y = by1;y < by2;y++)
1209 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a width x height rectangle from the color framebuffer
// (fb_colorpixels[0]) at (sx, sy) into mip level `mip` of texture `index` at
// (tx, ty). Returns without copying if the texture handle or mip level is
// invalid. Source and destination rectangles are clipped independently and
// the copy size is the smaller of the two.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1/tw/th/sw/sh, their
// initial values, and the statement under the `tw < 1 || th < 1` test are not
// visible in this listing.
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
// exclusive right/bottom edges in texture and framebuffer space
1220 int tx2 = tx + width;
1221 int ty2 = ty + height;
1224 int sx2 = sx + width;
1225 int sy2 = sy + height;
1235 unsigned int *spixels;
1236 unsigned int *tpixels;
1237 DPSOFTRAST_Texture *texture;
1238 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239 if (mip < 0 || mip >= texture->mipmaps) return;
1241 spixels = dpsoftrast.fb_colorpixels[0];
1242 swidth = dpsoftrast.fb_width;
1243 sheight = dpsoftrast.fb_height;
// mipmap[mip][0] is used as a byte offset into texture->bytes, and [2]/[3] as
// the level's width/height
1244 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245 twidth = texture->mipmap[mip][2];
1246 theight = texture->mipmap[mip][3];
// clip the destination rectangle to the mip level
1247 if (tx1 < 0) tx1 = 0;
1248 if (ty1 < 0) ty1 = 0;
1249 if (tx2 > twidth) tx2 = twidth;
1250 if (ty2 > theight) ty2 = theight;
// clip the source rectangle to the framebuffer
1251 if (sx1 < 0) sx1 = 0;
1252 if (sy1 < 0) sy1 = 0;
1253 if (sx2 > swidth) sx2 = swidth;
1254 if (sy2 > sheight) sy2 = sheight;
// never copy more than the source rectangle provides
1259 if (tw > sw) tw = sw;
1260 if (th > sh) th = sh;
1261 if (tw < 1 || th < 1)
// flip the source row: source rows are walked downward (sy1 - y) while
// texture rows are walked upward (ty1 + y)
1263 sy1 = sheight - 1 - sy1;
1264 for (y = 0;y < th;y++)
1265 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// mip level contents changed, so rebuild the chain when the texture has one
1266 if (texture->mipmaps > 1)
1267 DPSOFTRAST_Texture_CalculateMipmaps(index);
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 if (thread->texbound[command->unitnum])
1274 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275 thread->texbound[command->unitnum] = command->texture;
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 DPSOFTRAST_Command_SetTexture *command;
1280 DPSOFTRAST_Texture *texture;
1281 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1286 texture = DPSOFTRAST_Texture_GetByIndex(index);
1287 if (index && !texture)
1289 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1293 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294 command->unitnum = unitnum;
1295 command->texture = texture;
1297 dpsoftrast.texbound[unitnum] = texture;
1298 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 dpsoftrast.pointer_vertex3f = vertex3f;
1304 dpsoftrast.stride_vertex = stride;
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 dpsoftrast.pointer_color4f = color4f;
1309 dpsoftrast.pointer_color4ub = NULL;
1310 dpsoftrast.stride_color = stride;
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 dpsoftrast.pointer_color4f = NULL;
1315 dpsoftrast.pointer_color4ub = color4ub;
1316 dpsoftrast.stride_color = stride;
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322 dpsoftrast.stride_texcoord[unitnum] = stride;
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 thread->shader_mode = command->mode;
1329 thread->shader_permutation = command->permutation;
1330 thread->shader_exactspecularmath = command->exactspecularmath;
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335 command->mode = mode;
1336 command->permutation = permutation;
1337 command->exactspecularmath = exactspecularmath;
1339 dpsoftrast.shader_mode = mode;
1340 dpsoftrast.shader_permutation = permutation;
1341 dpsoftrast.shader_exactspecularmath = exactspecularmath;
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352 command->index = index;
1353 command->val[0] = v0;
1354 command->val[1] = v1;
1355 command->val[2] = v2;
1356 command->val[3] = v3;
1358 dpsoftrast.uniform4f[index*4+0] = v0;
1359 dpsoftrast.uniform4f[index*4+1] = v1;
1360 dpsoftrast.uniform4f[index*4+2] = v2;
1361 dpsoftrast.uniform4f[index*4+3] = v3;
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366 command->index = index;
1367 memcpy(command->val, v, sizeof(command->val));
1369 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads `arraysize` consecutive 4x4 float matrices starting at uniform slot
// `uniform`. Each matrix takes 4 uniform slots (index += 4) and 16 input floats
// (v += 16). For every matrix a UniformMatrix4f command is allocated and the
// same 16 floats are mirrored in dpsoftrast.uniform4f.
// NOTE(review): the declarations of i/index, the `else` between the two load
// groups, and the condition guarding the transpose block are not visible in
// this listing (presumably `if (transpose)` - confirm).
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1381 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383 __m128 m0, m1, m2, m3;
1384 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385 command->index = (DPSOFTRAST_UNIFORM)index;
// use unaligned loads when v is not aligned to ALIGN_SIZE
1386 if (((size_t)v)&(ALIGN_SIZE-1))
1388 m0 = _mm_loadu_ps(v);
1389 m1 = _mm_loadu_ps(v+4);
1390 m2 = _mm_loadu_ps(v+8);
1391 m3 = _mm_loadu_ps(v+12);
// aligned loads
1395 m0 = _mm_load_ps(v);
1396 m1 = _mm_load_ps(v+4);
1397 m2 = _mm_load_ps(v+8);
1398 m3 = _mm_load_ps(v+12);
// 4x4 transpose: interleave row pairs, then recombine the low/high halves
1402 __m128 t0, t1, t2, t3;
1403 t0 = _mm_unpacklo_ps(m0, m1);
1404 t1 = _mm_unpacklo_ps(m2, m3);
1405 t2 = _mm_unpackhi_ps(m0, m1);
1406 t3 = _mm_unpackhi_ps(m2, m3);
1407 m0 = _mm_movelh_ps(t0, t1);
1408 m1 = _mm_movehl_ps(t1, t0);
1409 m2 = _mm_movelh_ps(t2, t3);
1410 m3 = _mm_movehl_ps(t3, t2);
// aligned stores into the command payload
1412 _mm_store_ps(command->val, m0);
1413 _mm_store_ps(command->val+4, m1);
1414 _mm_store_ps(command->val+8, m2);
1415 _mm_store_ps(command->val+12, m3);
// aligned stores into the global mirror
1416 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: allocates a Uniform1i command for `index` and
// mirrors the value in dpsoftrast.uniform1i[].
// NOTE(review): the store of i0 into the command is not visible in this
// listing (embedded line 1433 is skipped); presumably command->val = i0 - confirm.
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432 command->index = index;
// global mirror, indexed through the value just written to the command
1435 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided byte source into dst using
// aligned 16-byte stores. Takes the unaligned-load path when either the
// source address or the stride is not a multiple of ALIGN_SIZE.
// NOTE(review): the loop headers and the dst/src advances are not visible in
// this listing.
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1441 float *end = dst + size*4;
1442 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1446 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1455 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into 4-float
// elements in dst, writing 1.0f as the fourth component.
//
// When stride == sizeof(float[3]) the input is tightly packed and four
// elements (48 bytes = three 16-byte loads) are converted per iteration, up
// to end4; the remainder goes through the one-element-at-a-time path below.
// Each path has an unaligned-load and an aligned-load variant.
//
// The recurring three-step idiom is: rotate the vector so the lane to be
// overwritten sits in lane 0, _mm_move_ss a 1.0f into lane 0, rotate back so
// the result is (x, y, z, 1).
//
// NOTE(review): the single-element paths load four floats per element, so they
// read one float past each 3-float element, including the last one.
// NOTE(review): loop headers and the dst advances are not visible in this listing.
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 float *end = dst + size*4;
1465 if (stride == sizeof(float[3]))
// end of the portion that can be handled four elements at a time
1467 float *end4 = dst + (size&~3)*4;
1468 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1472 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
// element 0
1473 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 1 (x from v1, y/z from v2)
1476 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 2 (x/y from v2, z from v3)
1479 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
// element 3 (already in lanes 1..3 of v3)
1483 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486 src += 4*sizeof(float[3]);
// same conversion with aligned loads
1493 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507 src += 4*sizeof(float[3]);
// one element at a time; unaligned variant if src or stride is misaligned
1511 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1515 __m128 v = _mm_loadu_ps((const float *)src);
1516 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519 _mm_store_ps(dst, v);
1528 __m128 v = _mm_load_ps((const float *)src);
1529 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into 4-float
// elements in dst, filling the last two components with (0, 1).
// Tightly packed input (stride == sizeof(float[2])) is converted two elements
// per 16-byte load up to end2; anything left goes through the single-element
// _mm_loadl_pi store at the bottom.
// NOTE(review): loop headers and the dst advances are not visible in this listing.
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1541 float *end = dst + size*4;
// supplies the constant upper half (0, 1) of every output element
1542 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1543 if (stride == sizeof(float[2]))
1545 float *end2 = dst + (size&~1)*4;
1546 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v holds two elements: (s0, t0, s1, t1)
1550 __m128 v = _mm_loadu_ps((const float *)src);
// (s0, t0, 0, 1)
1551 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
// (s1, t1, 0, 1)
1552 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1554 src += 2*sizeof(float[2]);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1565 src += 2*sizeof(float[2]);
// single element: load two floats into the low half, keep (0, 1) in the high half
1571 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided byte source into 4-float
// elements in dst, scaling each byte by 1/255.
// Tightly packed input (stride == 4) is converted four elements per 16-byte
// load up to end4; anything left goes through the single-element path at the
// bottom.
// NOTE(review): loop headers and the dst advances are not visible in this listing.
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1579 float *end = dst + size*4;
1580 __m128 scale = _mm_set1_ps(1.0f/255.0f);
1581 if (stride == sizeof(unsigned char[4]))
1583 float *end4 = dst + (size&~3)*4;
1584 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// widen 16 bytes to 16-bit (v1 = low 8 bytes, v2 = high 8 bytes), then each
// half to 32-bit, convert to float and scale
1588 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594 src += 4*sizeof(unsigned char[4]);
// same conversion with an aligned load
1601 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1607 src += 4*sizeof(unsigned char[4]);
// single element: the four bytes are read as one int, then widened and scaled
1613 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Stores the 4-float value at src into dst with aligned 16-byte stores; `end`
// marks dst + 4*size. src is loaded once (unaligned load), so it need not be
// 16-byte aligned.
// NOTE(review): the loop around the store is not visible in this listing;
// presumably it repeats until dst reaches end - confirm.
1620 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1622 float *end = dst + 4*size;
1623 __m128 v = _mm_loadu_ps(src);
1626 _mm_store_ps(dst, v);
// Transforms numitems 4-float vectors from in4f by the 16-float matrix
// inmatrix16f into out4f. Each result is
//   x*m0 + y*m1 + z*m2 + w*m3
// where m0..m3 are the four consecutive 4-float groups of the matrix.
// An identity matrix (bitwise compare) short-circuits to a plain copy, which
// is skipped entirely when out4f == in4f.
// NOTE(review): the store of the result, the loop headers, and the return
// after the identity case are not visible in this listing.
1632 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1635 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1636 __m128 m0, m1, m2, m3;
1638 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1640 // fast case for identity matrix
1641 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1644 end = out4f + numitems*4;
// the matrix is always loaded unaligned
1645 m0 = _mm_loadu_ps(inmatrix16f);
1646 m1 = _mm_loadu_ps(inmatrix16f + 4);
1647 m2 = _mm_loadu_ps(inmatrix16f + 8);
1648 m3 = _mm_loadu_ps(inmatrix16f + 12);
1649 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1653 __m128 v = _mm_loadu_ps(in4f);
1655 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1656 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1657 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1658 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// aligned input
1667 __m128 v = _mm_load_ps(in4f);
1669 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1670 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1671 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1672 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies numitems 4-float vectors from in4f to out4f with a single memcpy, so
// the two regions must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytecount = numitems * sizeof(float[4]);

	memcpy(out4f, in4f, bytecount);
}
// Perspective-divides and viewport-maps one 4-float vertex `in` into `out`.
// The vector is rotated so its fourth lane lands in lane 0, lane 0 is replaced
// by 1.0f, the result is multiplied by viewportscale, divided by the
// broadcast fourth input lane (w), offset by viewportcenter, and rotated back.
// viewportcenter/viewportscale are therefore applied in the rotated lane order,
// and the fourth output lane ends up as center + scale/w for that lane.
// Declares locals named p and w inside its own scope; `in` is evaluated once.
// NOTE(review): the enclosing brace lines of the macro are not visible in this listing.
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1688 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Same visible body as DPSOFTRAST_PROJECTVERTEX: rotate so the fourth lane is in
// lane 0, replace lane 0 with 1.0f, scale, divide by the broadcast fourth
// input lane, add the center, rotate back.
// NOTE(review): no difference from DPSOFTRAST_PROJECTVERTEX is visible in this
// listing, and the enclosing brace lines are not shown - confirm whether the
// two macros are meant to differ.
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1696 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Computes out = p.x*m0 + p.y*m1 + p.z*m2 + p.w*m3, broadcasting each lane of p
// and scaling the matching matrix vector.
// NOTE(review): the declaration of p is not visible in this listing;
// presumably it is a local copy of `in` - confirm.
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1705 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes a screen-space row range [*starty, *endy) covering the bounding box
// (minposf..maxposf) after transformation by inmatrix16f.
//
// minposf and maxposf are read with aligned loads, so they must be 16-byte
// aligned; the matrix is loaded unaligned.
//
// Outline:
//  - The matrix vectors have lanes 0 and 1 swapped, so each transformed
//    corner carries its second component in lane 0, where the scalar (_ss)
//    operations work.
//  - BBFRONT(k, pos) transforms corner k, computes clipdist = lane2 + lane3
//    of the result, and when that is >= 0 clears bit k of clipmask and folds
//    lane0/lane3 into minproj/maxproj.
//  - BBCLIP(k), for a corner whose clipmask bit is still set, visits the three
//    corners k^1, k^2, k^4; for each with a cleared bit it interpolates between
//    the two corners at the point where clipdist reaches zero, projects that
//    point, and folds it into minproj/maxproj.
//  - The range is clamped to [-2, 2], mapped through the viewport scale and
//    center, and truncated to ints.
//
// NOTE(review): BBFRONT(0) and BBFRONT(7), the `#define BBCLIP(k)` line, the
// declaration of `proj` inside BBFRONT, and the return statement are not
// visible in this listing; presumably the function returns clipmask - confirm.
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
// start with all eight corners marked as clipped
1713 int clipmask = 0xFF;
1714 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
// min/max start outside the [-2, 2] clamp range so any real value replaces them
1715 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// swap lanes 0 and 1 of every matrix vector
1718 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1722 #define BBFRONT(k, pos) \
1724 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1729 clipmask &= ~(1<<k); \
1730 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1736 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1737 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1738 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1739 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1740 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1741 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1745 if (clipmask&(1<<k)) \
1747 if (!(clipmask&(1<<(k^1)))) \
1749 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752 minproj = _mm_min_ss(minproj, proj); \
1753 maxproj = _mm_max_ss(maxproj, proj); \
1755 if (!(clipmask&(1<<(k^2)))) \
1757 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760 minproj = _mm_min_ss(minproj, proj); \
1761 maxproj = _mm_max_ss(maxproj, proj); \
1763 if (!(clipmask&(1<<(k^4)))) \
1765 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into lane 0 for the scalar math below
1774 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// clamp the projected range to [-2, 2] before mapping to pixels
1776 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// start comes from maxproj and end from minproj (presumably the scale for this
// lane is negative, i.e. the axis is flipped - confirm)
1780 *starty = _mm_cvttss_si32(maxproj);
1781 *endy = _mm_cvttss_si32(minproj)+1;
// Projects numitems already-transformed 4-float vertices: each input vector is
// stored unchanged to out4f and its viewport-mapped form to screen4f, while a
// component-wise min/max of the inputs is tracked. The min/max box is then
// handed to DPSOFTRAST_Vertex_BoundY with an identity matrix, and that call's
// result is returned (it also fills *starty / *endy).
// out4f and screen4f are written with aligned stores; in4f may be unaligned.
// NOTE(review): minpos/maxpos are seeded from in4f[0..3] before the loop, so
// in4f is read even when numitems is 0.
// NOTE(review): loop headers and pointer advances are not visible in this listing.
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1787 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788 float *end = out4f + numitems*4;
1789 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790 __m128 minpos, maxpos;
1791 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1793 minpos = maxpos = _mm_loadu_ps(in4f);
1796 __m128 v = _mm_loadu_ps(in4f);
1797 minpos = _mm_min_ps(minpos, v);
1798 maxpos = _mm_max_ps(maxpos, v);
1799 _mm_store_ps(out4f, v);
1800 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801 _mm_store_ps(screen4f, v);
// aligned input
1809 minpos = maxpos = _mm_load_ps(in4f);
1812 __m128 v = _mm_load_ps(in4f);
1813 minpos = _mm_min_ps(minpos, v);
1814 maxpos = _mm_max_ps(maxpos, v);
1815 _mm_store_ps(out4f, v);
1816 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817 _mm_store_ps(screen4f, v);
// spill the box to aligned arrays, since BoundY takes float pointers
1825 ALIGN(float minposf[4]);
1826 ALIGN(float maxposf[4]);
1827 _mm_store_ps(minposf, minpos);
1828 _mm_store_ps(maxposf, maxpos);
1829 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms numitems 4-float vertices by inmatrix16f, stores the transformed
// vectors to out4f and their viewport-mapped form to screen4f, and tracks a
// component-wise min/max of the untransformed inputs. The box and the matrix
// are then passed to DPSOFTRAST_Vertex_BoundY, whose result is returned (it
// also fills *starty / *endy).
// An identity matrix (bitwise compare) is delegated to DPSOFTRAST_Vertex_Project.
// NOTE(review): minpos/maxpos are seeded from in4f[0..3] before the loop, so
// in4f is read even when numitems is 0.
// NOTE(review): the declaration of `end`, loop headers, and pointer advances
// are not visible in this listing.
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1836 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1839 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841 end = out4f + numitems*4;
1842 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844 m0 = _mm_loadu_ps(inmatrix16f);
1845 m1 = _mm_loadu_ps(inmatrix16f + 4);
1846 m2 = _mm_loadu_ps(inmatrix16f + 8);
1847 m3 = _mm_loadu_ps(inmatrix16f + 12);
1848 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// unaligned input
1850 minpos = maxpos = _mm_loadu_ps(in4f);
1853 __m128 v = _mm_loadu_ps(in4f);
// the bounding box is taken before the transform
1854 minpos = _mm_min_ps(minpos, v);
1855 maxpos = _mm_max_ps(maxpos, v);
1856 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857 _mm_store_ps(out4f, v);
1858 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859 _mm_store_ps(screen4f, v);
// aligned input
1867 minpos = maxpos = _mm_load_ps(in4f);
1870 __m128 v = _mm_load_ps(in4f);
1871 minpos = _mm_min_ps(minpos, v);
1872 maxpos = _mm_max_ps(maxpos, v);
1873 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874 _mm_store_ps(out4f, v);
1875 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876 _mm_store_ps(screen4f, v);
// spill the box to aligned arrays, since BoundY takes float pointers
1884 ALIGN(float minposf[4]);
1885 ALIGN(float maxposf[4]);
1886 _mm_store_ps(minposf, minpos);
1887 _mm_store_ps(maxposf, maxpos);
1888 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Loads one client vertex array into dpsoftrast.post_array4f[outarray] as
// 4-float elements, covering numvertices elements starting at firstvertex.
//  - DPSOFTRAST_ARRAY_POSITION: 3-float positions expanded to 4 floats.
//  - DPSOFTRAST_ARRAY_COLOR: the float color array if set, else the
//    unsigned-byte color array if set, else the constant dpsoftrast.color.
//  - texcoord arrays (inarray - DPSOFTRAST_ARRAY_TEXCOORD0 selects the slot):
//    dispatched on the slot's component count to the 2/3/4-float loaders.
// NOTE(review): the switch header, case labels for the texcoord path, break
// statements, the handling of an unset texcoord pointer, and the return value
// are not visible in this listing.
1894 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1897 float *outf = dpsoftrast.post_array4f[outarray];
1898 const unsigned char *inb;
1899 int firstvertex = dpsoftrast.firstvertex;
1900 int numvertices = dpsoftrast.numvertices;
1904 case DPSOFTRAST_ARRAY_POSITION:
1905 stride = dpsoftrast.stride_vertex;
// strides are in bytes, hence the byte-pointer arithmetic
1906 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1907 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1909 case DPSOFTRAST_ARRAY_COLOR:
1910 stride = dpsoftrast.stride_color;
1911 if (dpsoftrast.pointer_color4f)
1913 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1914 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916 else if (dpsoftrast.pointer_color4ub)
1918 stride = dpsoftrast.stride_color;
1919 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1920 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
// no color array bound: replicate the constant color
1924 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1928 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1929 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1931 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1932 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1935 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1938 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms an array in place by inmatrix16f. When inarray >= 0 the data is
// first loaded from that client array into post_array4f[outarray]; otherwise
// the current contents of post_array4f[outarray] are transformed.
// NOTE(review): the return statement is not visible in this listing;
// presumably it returns `data` - confirm.
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1955 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects an array to screen space without transforming it: fills
// dpsoftrast.screencoord4f, dpsoftrast.drawstarty/drawendy, and stores the
// DPSOFTRAST_Vertex_Project result in dpsoftrast.drawclipped. When
// inarray >= 0 the data is first loaded from that client array.
// NOTE(review): the return statement is not visible in this listing;
// presumably it returns `data` - confirm.
1961 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1964 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1965 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms an array in place by inmatrix16f and projects it to screen space:
// fills dpsoftrast.screencoord4f, dpsoftrast.drawstarty/drawendy, and stores
// the DPSOFTRAST_Vertex_TransformProject result in dpsoftrast.drawclipped.
// When inarray >= 0 the data is first loaded from that client array.
// NOTE(review): the return statement is not visible in this listing;
// presumably it returns `data` - confirm.
1973 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1976 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1977 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Steps a per-pixel value z across a span, approximating 1/w: the exact
// reciprocal is evaluated only at sub-span boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and at the last pixel), and z is advanced
// linearly by dz between them. w itself is linear in screen space:
// triangle->w[2] + x*w[0] + y*w[1].
// NOTE(review): the declarations of x, z, and dz and the body of the inner loop
// are not visible in this listing; presumably the inner loop writes z to
// zf[x] - confirm.
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1987 int startx = span->startx;
1988 int endx = span->endx;
1989 float wslope = triangle->w[0];
// w at the span origin (span->x, span->y); startx/endx are offsets from it
1990 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// exact reciprocal at the first pixel
1991 float endz = 1.0f / (w + wslope * startx);
1992 for (x = startx;x < endx;)
1994 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span ends exactly on the final pixel
1996 if (nextsub >= endx) nextsub = endsub = endx-1;
1997 endz = 1.0f / (w + wslope * nextsub);
// a one-pixel sub-span has no slope (avoids dividing by zero)
1998 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1999 for (; x <= endsub; x++, z += dz)
2004 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2007 int startx = span->startx;
2008 int endx = span->endx;
2011 unsigned char * RESTRICT pixelmask = span->pixelmask;
2012 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2015 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2016 // handle alphatest now (this affects depth writes too)
2017 if (thread->alphatest)
2018 for (x = startx;x < endx;x++)
2019 if (in4f[x*4+3] < 0.5f)
2020 pixelmask[x] = false;
2021 // FIXME: this does not handle bigendian
2022 switch(thread->fb_blendmode)
2024 case DPSOFTRAST_BLENDMODE_OPAQUE:
2025 for (x = startx;x < endx;x++)
2029 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2030 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2031 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2032 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2033 pixel[x*4+0] = d[0];
2034 pixel[x*4+1] = d[1];
2035 pixel[x*4+2] = d[2];
2036 pixel[x*4+3] = d[3];
2039 case DPSOFTRAST_BLENDMODE_ALPHA:
2040 for (x = startx;x < endx;x++)
2044 a = in4f[x*4+3] * 255.0f;
2045 b = 1.0f - in4f[x*4+3];
2046 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2047 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2048 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2049 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2050 pixel[x*4+0] = d[0];
2051 pixel[x*4+1] = d[1];
2052 pixel[x*4+2] = d[2];
2053 pixel[x*4+3] = d[3];
2056 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2057 for (x = startx;x < endx;x++)
2061 a = in4f[x*4+3] * 255.0f;
2062 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066 pixel[x*4+0] = d[0];
2067 pixel[x*4+1] = d[1];
2068 pixel[x*4+2] = d[2];
2069 pixel[x*4+3] = d[3];
2072 case DPSOFTRAST_BLENDMODE_ADD:
2073 for (x = startx;x < endx;x++)
2077 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081 pixel[x*4+0] = d[0];
2082 pixel[x*4+1] = d[1];
2083 pixel[x*4+2] = d[2];
2084 pixel[x*4+3] = d[3];
2087 case DPSOFTRAST_BLENDMODE_INVMOD:
2088 for (x = startx;x < endx;x++)
2092 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2093 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2094 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2095 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2096 pixel[x*4+0] = d[0];
2097 pixel[x*4+1] = d[1];
2098 pixel[x*4+2] = d[2];
2099 pixel[x*4+3] = d[3];
2102 case DPSOFTRAST_BLENDMODE_MUL:
2103 for (x = startx;x < endx;x++)
2107 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2108 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2109 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2110 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2111 pixel[x*4+0] = d[0];
2112 pixel[x*4+1] = d[1];
2113 pixel[x*4+2] = d[2];
2114 pixel[x*4+3] = d[3];
2117 case DPSOFTRAST_BLENDMODE_MUL2:
2118 for (x = startx;x < endx;x++)
2122 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2123 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2124 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2125 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2126 pixel[x*4+0] = d[0];
2127 pixel[x*4+1] = d[1];
2128 pixel[x*4+2] = d[2];
2129 pixel[x*4+3] = d[3];
2132 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133 for (x = startx;x < endx;x++)
2137 a = in4f[x*4+3] * -255.0f;
2138 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2139 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2140 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2141 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2142 pixel[x*4+0] = d[0];
2143 pixel[x*4+1] = d[1];
2144 pixel[x*4+2] = d[2];
2145 pixel[x*4+3] = d[3];
2148 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2149 for (x = startx;x < endx;x++)
2154 b = 1.0f - in4f[x*4+3];
2155 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2156 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2157 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2158 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2159 pixel[x*4+0] = d[0];
2160 pixel[x*4+1] = d[1];
2161 pixel[x*4+2] = d[2];
2162 pixel[x*4+3] = d[3];
2165 case DPSOFTRAST_BLENDMODE_INVADD:
2166 for (x = startx;x < endx;x++)
2170 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2171 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2172 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2173 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2174 pixel[x*4+0] = d[0];
2175 pixel[x*4+1] = d[1];
2176 pixel[x*4+2] = d[2];
2177 pixel[x*4+3] = d[3];
// Writes a span of finished 8-bit BGRA colors (in4ub, indexed by x) into the
// color framebuffer dpsoftrast.fb_colorpixels[0], combining them with the
// pixels already stored there according to thread->fb_blendmode.
//   thread   - supplies the alphatest flag and the framebuffer blend mode
//   triangle - not referenced by the visible code
//   span     - supplies x, y, startx, endx and the per-pixel pixelmask bytes
//   in4ub    - source colors, 4 bytes per pixel, alpha in byte 3
// The pixelmask bytes gate which pixels are written.
2183 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2187 int startx = span->startx;
2188 int endx = span->endx;
// the same source and framebuffer memory is viewed both as bytes and as
// packed 32-bit pixels
2189 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2190 unsigned char * RESTRICT pixelmask = span->pixelmask;
2191 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2192 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// move both framebuffer views to the row/column origin of this span so that
// they can be indexed with the same x as in4ub and pixelmask
2195 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2196 pixeli += span->y * dpsoftrast.fb_width + span->x;
2197 // handle alphatest now (this affects depth writes too)
2198 if (thread->alphatest)
2199 for (x = startx;x < endx;x++)
// in4ub holds 8-bit alpha (0..255), so the half-opacity cutoff is 128; a
// comparison against 0.5f would only ever reject alpha == 0
2200 if (in4ub[x*4+3] < 128)
2201 pixelmask[x] = false;
2202 // FIXME: this does not handle bigendian
2203 switch(thread->fb_blendmode)
// OPAQUE: plain copy; when four consecutive mask bytes are all 1 the four
// pixels are copied with a single unaligned 128-bit load/store
2205 case DPSOFTRAST_BLENDMODE_OPAQUE:
2206 for (x = startx;x + 4 <= endx;)
2208 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2210 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1) is the shared loop for all blending modes: it
// walks the span two pixels at a time, dispatching on the pair of mask bytes,
// widens the 8-bit channels of src and dst to 16-bit lanes, runs blend2 (two
// pixels) or blend1 (one pixel) to update dst, and packs dst back to bytes
// with unsigned saturation; a trailing loop handles a leftover pixel.
// ALPHA computes dst + (((src - dst) * srcalpha) >> 8): both factors are
// shifted left by 4 so that the high 16 bits of the product are the >> 8 result.
2224 case DPSOFTRAST_BLENDMODE_ALPHA:
2225 #define FINISHBLEND(blend2, blend1) \
2226 for (x = startx;x + 1 < endx;x += 2) \
2229 switch (*(const unsigned short*)&pixelmask[x]) \
2232 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2235 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2238 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2239 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2241 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2245 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2247 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2252 for(;x < endx; x++) \
2255 if (!pixelmask[x]) \
2257 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2258 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2260 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2264 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2267 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2268 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst + ((src * srcalpha) >> 8); blend replicates lane 3 (alpha)
// of each pixel into all four of that pixel's lanes
2271 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2273 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2276 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2277 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: src + dst (clamped to 255 by the saturating pack in FINISHBLEND)
2280 case DPSOFTRAST_BLENDMODE_ADD:
2281 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst - ((dst * src) >> 8)
2283 case DPSOFTRAST_BLENDMODE_INVMOD:
2285 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2287 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: (src * dst) >> 8
2290 case DPSOFTRAST_BLENDMODE_MUL:
2291 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: (src * dst) >> 7, i.e. twice the MUL result before saturation
2293 case DPSOFTRAST_BLENDMODE_MUL2:
2294 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst - ((src * srcalpha) >> 8); negative lanes saturate to 0 on pack
2296 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2298 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2299 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2301 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2302 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: src + dst - ((dst * srcalpha) >> 8)
2305 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2307 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2308 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2310 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2311 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst + (((255 - dst) * src) >> 8), using the same shift-by-4 /
// high-multiply form as ALPHA
2314 case DPSOFTRAST_BLENDMODE_INVADD:
2316 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2318 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texture unit texunitindex for each pixel of
// the span and writes the result to out4f as four floats per pixel.
//   thread       - supplies the bound texture (thread->texbound)
//   triangle     - supplies the mip level per texture unit and, through
//                  DPSOFTRAST_CALCATTRIB4F, the interpolated attribute
//   span         - supplies startx/endx
//   out4f        - output, indexed by x*4; texel bytes 2,1,0,3 are written to
//                  components 0,1,2,3 scaled to 0..1
//   texunitindex - texture unit to sample
//   arrayindex   - attribute array holding the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinates (presumably the perspective divide - confirm)
// Texture coordinates are evaluated exactly only at the ends of sub-spans of
// at most DPSOFTRAST_DRAW_MAXSUBSPAN pixels and stepped linearly in between
// using fixed point with 12 fractional bits.
2325 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2328 int startx = span->startx;
2329 int endx = span->endx;
2334 float tc[2], endtc[2];
2336 unsigned int tci[2];
2337 unsigned int tci1[2];
2338 unsigned int tcimin[2];
2339 unsigned int tcimax[2];
2344 const unsigned char * RESTRICT pixelbase;
2345 const unsigned char * RESTRICT pixel[4];
2346 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2347 // if no texture is bound, just fill it with white
2350 for (x = startx;x < endx;x++)
2352 out4f[x*4+0] = 1.0f;
2353 out4f[x*4+1] = 1.0f;
2354 out4f[x*4+2] = 1.0f;
2355 out4f[x*4+3] = 1.0f;
2359 mip = triangle->mip[texunitindex];
// pixelbase points at the first texel of the selected mip level
2360 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2361 // if this mipmap of the texture is 1 pixel, just fill it with that color
2362 if (texture->mipmap[mip][1] == 4)
// read the texel of the selected mip level (pixelbase), not the first texel
// of the whole texture allocation; the BGRA8 variant of this function also
// reads the single texel through pixelbase
2364 c[0] = pixelbase[2] * (1.0f/255.0f);
2365 c[1] = pixelbase[1] * (1.0f/255.0f);
2366 c[2] = pixelbase[0] * (1.0f/255.0f);
2367 c[3] = pixelbase[3] * (1.0f/255.0f);
2368 for (x = startx;x < endx;x++)
2370 out4f[x*4+0] = c[0];
2371 out4f[x*4+1] = c[1];
2372 out4f[x*4+2] = c[2];
2373 out4f[x*4+3] = c[3];
2377 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2378 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2379 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the width and height of the mip level:
// they scale the coordinates to texels, give the row pitch (tciwidth), and
// provide the clamp limits and wrap masks (size - 1)
2380 tcscale[0] = texture->mipmap[mip][2];
2381 tcscale[1] = texture->mipmap[mip][3];
2382 tciwidth = texture->mipmap[mip][2];
2385 tcimax[0] = texture->mipmap[mip][2]-1;
2386 tcimax[1] = texture->mipmap[mip][3]-1;
2387 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2388 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2389 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2390 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2396 for (x = startx;x < endx;)
2398 unsigned int subtc[2];
2399 unsigned int substep[2];
// subscale converts a coordinate delta over the sub-span into a per-pixel
// step with 12 fractional bits (4096 / pixel count)
2400 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2401 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// final, possibly shorter, sub-span: end exactly on the last pixel
2402 if (nextsub >= endx)
2404 nextsub = endsub = endx-1;
2405 if (x < nextsub) subscale = 4096.0f / (nextsub - x);
// exact texel-space coordinate at the end of this sub-span
2409 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2410 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2416 substep[0] = (endtc[0] - tc[0]) * subscale;
2417 substep[1] = (endtc[1] - tc[1]) * subscale;
2418 subtc[0] = tc[0] * (1<<12);
2419 subtc[1] = tc[1] * (1<<12);
// bilinear sampling, coordinates clamped to [tcimin, tcimax]
2422 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2424 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// the low 12 bits are the position between texels; the four lerp weights
// always sum to 0x1000*0x1000, so weight sum times the maximum texel value
// 255 is 0xFF000000, which is the divisor used to normalise to 0..1 below
2426 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2427 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2428 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2429 tci[0] = subtc[0]>>12;
2430 tci[1] = subtc[1]>>12;
2431 tci1[0] = tci[0] + 1;
2432 tci1[1] = tci[1] + 1;
2433 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2434 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2435 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2436 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
// the four neighbouring texels: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
2437 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2438 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2439 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2440 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2441 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2442 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2443 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2444 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2445 out4f[x*4+0] = c[0];
2446 out4f[x*4+1] = c[1];
2447 out4f[x*4+2] = c[2];
2448 out4f[x*4+3] = c[3];
// bilinear sampling, coordinates wrapped by masking with size - 1
// (this assumes power-of-two mip dimensions - TODO confirm)
2453 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455 unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2456 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2457 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2458 tci[0] = subtc[0]>>12;
2459 tci[1] = subtc[1]>>12;
2460 tci1[0] = tci[0] + 1;
2461 tci1[1] = tci[1] + 1;
2462 tci[0] &= tciwrapmask[0];
2463 tci[1] &= tciwrapmask[1];
2464 tci1[0] &= tciwrapmask[0];
2465 tci1[1] &= tciwrapmask[1];
2466 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2467 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2468 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2469 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2470 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2471 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2472 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2473 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2474 out4f[x*4+0] = c[0];
2475 out4f[x*4+1] = c[1];
2476 out4f[x*4+2] = c[2];
2477 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, coordinates clamped to [tcimin, tcimax]
2481 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2483 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2485 tci[0] = subtc[0]>>12;
2486 tci[1] = subtc[1]>>12;
2487 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2488 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2489 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2490 c[0] = pixel[0][2] * (1.0f / 255.0f);
2491 c[1] = pixel[0][1] * (1.0f / 255.0f);
2492 c[2] = pixel[0][0] * (1.0f / 255.0f);
2493 c[3] = pixel[0][3] * (1.0f / 255.0f);
2494 out4f[x*4+0] = c[0];
2495 out4f[x*4+1] = c[1];
2496 out4f[x*4+2] = c[2];
2497 out4f[x*4+3] = c[3];
// single-texel (nearest) sampling, coordinates wrapped by masking
2502 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2504 tci[0] = subtc[0]>>12;
2505 tci[1] = subtc[1]>>12;
2506 tci[0] &= tciwrapmask[0];
2507 tci[1] &= tciwrapmask[1];
2508 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2509 c[0] = pixel[0][2] * (1.0f / 255.0f);
2510 c[1] = pixel[0][1] * (1.0f / 255.0f);
2511 c[2] = pixel[0][0] * (1.0f / 255.0f);
2512 c[3] = pixel[0][3] * (1.0f / 255.0f);
2513 out4f[x*4+0] = c[0];
2514 out4f[x*4+1] = c[1];
2515 out4f[x*4+2] = c[2];
2516 out4f[x*4+3] = c[3];
// SSE2 variant of the 2D texture span sampler: samples the texture bound to
// texture unit texunitindex for each pixel of the span and writes packed
// 4-byte pixels (copied or filtered straight from the texel bytes) to out4ub.
//   thread       - supplies the bound texture (thread->texbound)
//   triangle     - supplies the mip level per texture unit and, through
//                  DPSOFTRAST_CALCATTRIB, the interpolated attribute
//   span         - supplies startx/endx
//   out4ub       - output, 4 bytes per pixel, indexed by x
//   texunitindex - texture unit to sample
//   arrayindex   - attribute array holding the texture coordinates
//   zf           - per-pixel factor multiplied into the interpolated
//                  coordinates (presumably the perspective divide - confirm)
// Coordinates are evaluated exactly at the ends of sub-spans of at most
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels and stepped linearly in between as 16.16
// fixed point; the main loops produce two pixels per iteration and are each
// followed by code that produces a single remaining pixel.
2522 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2526 int startx = span->startx;
2527 int endx = span->endx;
2529 __m128 data, slope, tcscale;
2530 __m128i tcsize, tcmask, tcoffset, tcmax;
2532 __m128i subtc, substep, endsubtc;
2535 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2536 const unsigned char * RESTRICT pixelbase;
2537 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2538 // if no texture is bound, just fill it with white
2541 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2544 mip = triangle->mip[texunitindex];
// pixelbase points at the first texel of the selected mip level
2545 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2546 // if this mipmap of the texture is 1 pixel, just fill it with that color
2547 if (texture->mipmap[mip][1] == 4)
2549 unsigned int k = *((const unsigned int *)pixelbase);
2550 for (x = startx;x < endx;x++)
2554 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2555 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2556 flags = texture->flags;
// tcsize = (mipmap[mip][2], mipmap[mip][3]) repeated twice, i.e. the two
// dimensions of the mip level for two pixels; tcmask is size - 1
2557 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2558 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2559 tcscale = _mm_cvtepi32_ps(tcsize);
// pre-scale the attribute and its slope into texel units
2560 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2561 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2562 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2564 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
// 16.16 fixed-point coordinate at the start of the span
2565 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// each 32-bit lane of tcoffset holds the 16-bit pair (4, width*4), so
// _mm_madd_epi16 of an (x, y) pair with it yields the texel's byte offset
// x*4 + y*width*4
2566 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
// size - 1 as 16-bit lanes: upper clamp bound and wrap mask for the packed
// integer coordinates
2567 tcmax = _mm_packs_epi32(tcmask, tcmask);
2568 for (x = startx;x < endx;)
2570 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2571 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
// final, possibly shorter, sub-span: end exactly on the last pixel
2572 if (nextsub >= endx)
2574 nextsub = endsub = endx-1;
2575 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
// exact coordinate at the end of this sub-span
2579 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2581 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2582 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2583 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
// subtc carries the coordinates of two consecutive pixels (low and high 64
// bits), so the per-iteration step is doubled
2584 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2585 substep = _mm_slli_epi32(substep, 1);
// integer coordinates at the start of the sub-span and one step past its
// end; if all are >= 0 and < size - 1, every texel and its +1 neighbours are
// inside the mip level and the path below needs no clamping or wrapping
2588 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2589 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
// bytes per texel row (the high 16 bits of a tcoffset lane)
2591 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2592 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2594 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
// keep only the integer (high 16-bit) halves of the 16.16 coordinates
2595 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2596 tci = _mm_madd_epi16(tci, tcoffset);
2597 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2598 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
// each 64-bit load fetches two horizontally adjacent texels; ptr + stride is
// the same pair on the next row
2599 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2600 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2601 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2602 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
// fractions halved to 15 bits so they stay non-negative for the signed
// high multiply; the difference is doubled to compensate
2603 fracm = _mm_srli_epi16(subtc, 1);
// blend the two rows with the second coordinate's fraction (per pixel) ...
2604 pix1 = _mm_add_epi16(pix1,
2605 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2606 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2607 pix3 = _mm_add_epi16(pix3,
2608 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2609 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
// ... then blend the left and right texels of both pixels with the first
// coordinate's fraction and pack the two results into 8 bytes
2610 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2611 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2612 pix2 = _mm_add_epi16(pix2,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2614 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2615 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the in-range bilinear path
2619 const unsigned char * RESTRICT ptr1;
2620 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2621 tci = _mm_madd_epi16(tci, tcoffset);
2622 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2623 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2624 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2625 fracm = _mm_srli_epi16(subtc, 1);
2626 pix1 = _mm_add_epi16(pix1,
2627 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2628 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2629 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2630 pix1 = _mm_add_epi16(pix1,
2631 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2633 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with clamp-to-edge: the four corner coordinates
// (x,y), (x+1,y), (x,y+1), (x+1,y+1) are clamped to [0, size - 1] and each
// texel is fetched individually
2637 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2639 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2641 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2642 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2643 tci = _mm_madd_epi16(tci, tcoffset);
2644 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2645 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2646 _mm_setzero_si128());
2647 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2648 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2649 _mm_setzero_si128());
2650 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
// the second pixel of the pair must be clamped exactly like the first one;
// masking with tcmax here would wrap it around instead of clamping to the edge
2651 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2652 tci = _mm_madd_epi16(tci, tcoffset);
2653 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2654 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2655 _mm_setzero_si128());
2656 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2657 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2658 _mm_setzero_si128());
2659 fracm = _mm_srli_epi16(subtc, 1);
2660 pix1 = _mm_add_epi16(pix1,
2661 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2663 pix3 = _mm_add_epi16(pix3,
2664 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2665 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2666 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2667 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2668 pix2 = _mm_add_epi16(pix2,
2669 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2670 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2671 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the clamped bilinear path
2675 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2676 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2677 tci = _mm_madd_epi16(tci, tcoffset);
2678 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680 _mm_setzero_si128());
2681 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683 _mm_setzero_si128());
2684 fracm = _mm_srli_epi16(subtc, 1);
2685 pix1 = _mm_add_epi16(pix1,
2686 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2687 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2688 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2689 pix1 = _mm_add_epi16(pix1,
2690 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2691 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2692 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// bilinear sampling with wrapping: corner coordinates are masked with
// size - 1 (this assumes power-of-two mip dimensions - TODO confirm)
2698 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2700 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2701 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2702 tci = _mm_madd_epi16(tci, tcoffset);
2703 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2704 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2705 _mm_setzero_si128());
2706 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2707 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2708 _mm_setzero_si128());
2709 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2710 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2711 tci = _mm_madd_epi16(tci, tcoffset);
2712 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2713 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2714 _mm_setzero_si128());
2715 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2716 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2717 _mm_setzero_si128());
2718 fracm = _mm_srli_epi16(subtc, 1);
2719 pix1 = _mm_add_epi16(pix1,
2720 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2721 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2722 pix3 = _mm_add_epi16(pix3,
2723 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2724 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2725 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2726 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2727 pix2 = _mm_add_epi16(pix2,
2728 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2729 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2730 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
// single-pixel version of the wrapped bilinear path
2734 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2735 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2736 tci = _mm_madd_epi16(tci, tcoffset);
2737 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2738 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2739 _mm_setzero_si128());
2740 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2741 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2742 _mm_setzero_si128());
2743 fracm = _mm_srli_epi16(subtc, 1);
2744 pix1 = _mm_add_epi16(pix1,
2745 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2746 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2747 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2748 pix1 = _mm_add_epi16(pix1,
2749 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2750 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2751 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// single-texel sampling: the texel is copied unmodified; coordinates are
// clamped to [0, size - 1] here
2758 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2760 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2762 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2763 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2764 tci = _mm_madd_epi16(tci, tcoffset);
2765 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2766 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2770 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2771 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2772 tci = _mm_madd_epi16(tci, tcoffset);
2773 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// single-texel sampling with coordinates wrapped by masking with size - 1
2779 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2781 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2782 tci = _mm_and_si128(tci, tcmax);
2783 tci = _mm_madd_epi16(tci, tcoffset);
2784 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2785 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2789 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2790 tci = _mm_and_si128(tci, tcmax);
2791 tci = _mm_madd_epi16(tci, tcoffset);
2792 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-texture span entry point. The visible code performs no texture lookup:
// it fills the span's output pixels [startx, endx) in out4ub with 255 in every
// byte (opaque white). triangle, texunitindex, arrayindex and zf are not used
// by the visible code.
2801 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count is (endx - startx) pixels * 4 bytes; the reversed difference
// is negative and would convert to an enormous size_t, overrunning the buffer
2804 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// NOTE(review): only the signature is visible here. Presumably this samples a
// shadow map for the given vector and returns a float factor - confirm
// against the function body.
2807 float DPSOFTRAST_SampleShadowmap(const float *vector)
// For each pixel x in [startx, endx) of the span, evaluates the interpolated
// attribute arrayindex as (data + slope*x) * z and writes in4f multiplied
// component-wise by it to out4f (four floats per pixel, indexed by x*4).
//   triangle, span - inputs to DPSOFTRAST_CALCATTRIB4F, which fills data/slope
//   zf             - per-pixel factor array; z is presumably zf[x] (the
//                    assignment is not visible here - confirm)
2813 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2816 int startx = span->startx;
2817 int endx = span->endx;
2822 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2823 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2826 c[0] = (data[0] + slope[0]*x) * z;
2827 c[1] = (data[1] + slope[1]*x) * z;
2828 c[2] = (data[2] + slope[2]*x) * z;
2829 c[3] = (data[3] + slope[3]*x) * z;
2830 out4f[x*4+0] = in4f[x*4+0] * c[0];
2831 out4f[x*4+1] = in4f[x*4+1] * c[1];
2832 out4f[x*4+2] = in4f[x*4+2] * c[2];
2833 out4f[x*4+3] = in4f[x*4+3] * c[3];
// For each pixel x in [startx, endx) of the span, evaluates the interpolated
// attribute arrayindex as (data + slope*x) * z and stores the four components
// in out4f (indexed by x*4).
//   triangle, span - inputs to DPSOFTRAST_CALCATTRIB4F, which fills data/slope
//   zf             - per-pixel factor array; z is presumably zf[x] (the
//                    assignment is not visible here - confirm)
2837 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2840 int startx = span->startx;
2841 int endx = span->endx;
2846 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2847 for (x = startx;x < endx;x++)
// attribute value at this pixel: linear in x, then scaled by z
2850 c[0] = (data[0] + slope[0]*x) * z;
2851 c[1] = (data[1] + slope[1]*x) * z;
2852 c[2] = (data[2] + slope[2]*x) * z;
2853 c[3] = (data[3] + slope[3]*x) * z;
2854 out4f[x*4+0] = c[0];
2855 out4f[x*4+1] = c[1];
2856 out4f[x*4+2] = c[2];
2857 out4f[x*4+3] = c[3];
// For each pixel x in [startx, endx) of the span:
//   out4f = ina4f + max(inb4f - subcolor, 0), component-wise.
// All buffers hold four floats per pixel and are indexed by x*4; subcolor is
// a single 4-component color. triangle is not used by the visible code.
2861 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2863 int x, startx = span->startx, endx = span->endx;
2864 float c[4], localcolor[4];
// copy subcolor into locals before the loop
2865 localcolor[0] = subcolor[0];
2866 localcolor[1] = subcolor[1];
2867 localcolor[2] = subcolor[2];
2868 localcolor[3] = subcolor[3];
2869 for (x = startx;x < endx;x++)
// subtract the threshold color and clamp negative results to zero
2871 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2872 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2873 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2874 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2875 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2876 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2877 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2878 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Float span helper: out = ina * inb, component-wise, for every pixel x in
// [startx, endx). Pixel x occupies indices x*4 .. x*4+3 of each buffer.
2882 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2884 int x, startx = span->startx, endx = span->endx;
2885 for (x = startx;x < endx;x++)
2887 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2888 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2889 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2890 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Float span helper: out = ina + inb, component-wise, for every pixel x in
// [startx, endx). Pixel x occupies indices x*4 .. x*4+3 of each buffer.
2894 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2896 int x, startx = span->startx, endx = span->endx;
2897 for (x = startx;x < endx;x++)
2899 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2900 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2901 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2902 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Float span helper: blends the two buffers per pixel as ina*a + inb*b for
// every pixel x in [startx, endx), where a is one minus the 4th component of
// the inb pixel.
// NOTE(review): b is presumably the 4th component of the inb pixel itself,
// making this a standard alpha blend - confirm.
2906 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2908 int x, startx = span->startx, endx = span->endx;
2910 for (x = startx;x < endx;x++)
// weight applied to ina
2912 a = 1.0f - inb4f[x*4+3];
2914 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2915 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2916 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2917 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Float span helper: blends every pixel x in [startx, endx) towards a single
// color, out = in * (1 - color[3]) + color * color[3], component-wise.
// The 4th component of `color` is used both as the blend factor and as the
// 4th channel of the color being blended in.
2921 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2923 int x, startx = span->startx, endx = span->endx;
2924 float localcolor[4], ilerp, lerp;
// local copy of the uniform color
2925 localcolor[0] = color[0];
2926 localcolor[1] = color[1];
2927 localcolor[2] = color[2];
2928 localcolor[3] = color[3];
// blend weights are constant for the whole span
2929 ilerp = 1.0f - localcolor[3];
2930 lerp = localcolor[3];
2931 for (x = startx;x < endx;x++)
2933 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2934 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2935 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2936 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
// SSE2 span helper for 8-bit pixels: multiplies each input pixel by an
// interpolated varying (attribute index `arrayindex`).
// The varying (attribute * zf) is evaluated exactly only at sub-span end
// points, DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; in between it is stepped
// linearly in fixed point. It is scaled by 256 so that _mm_mulhi_epu16 against
// a pixel placed in the high byte of each 16-bit lane yields pixel * varying.
// NOTE(review): mod/submod are presumably the previous end point values carried
// into each sub-span - confirm.
2942 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2946 int startx = span->startx;
2947 int endx = span->endx;
2950 __m128i submod, substep, endsubmod;
2951 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the varying matches the pixel channel order
2952 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2953 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact value at the first pixel of the span
2954 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2955 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2956 for (x = startx; x < endx;)
2958 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2959 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2960 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to its length
2962 nextsub = endsub = endx-1;
2963 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
// exact value at the end of this sub-span
2967 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed point step across the sub-span
2968 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2969 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
// low half = value at pixel x, high half = value at pixel x+1
2970 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
// the loop below consumes two pixels per iteration, so both halves have to
// advance by two per-pixel steps each time; adding a single step made the
// varying lag behind at half rate within the sub-span
2971 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
2972 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels at once: widen bytes into the high byte of each 16-bit lane
2974 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2975 pix = _mm_mulhi_epu16(pix, submod);
2976 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the above
2980 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2981 pix = _mm_mulhi_epu16(pix, submod);
2982 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 span helper for 8-bit pixels: writes an interpolated varying
// (attribute index `arrayindex`) to the output as pixels.
// The varying (attribute * zf) is evaluated exactly only at sub-span end
// points, DPSOFTRAST_DRAW_MAXSUBSPAN pixels apart; in between it is stepped
// linearly in fixed point. It is scaled by 4095 and shifted right by 4 before
// the saturating pack down to bytes.
// NOTE(review): mod/submod are presumably the previous end point values carried
// into each sub-span - confirm.
2989 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2993 int startx = span->startx;
2994 int endx = span->endx;
2997 __m128i submod, substep, endsubmod;
2998 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
// swap components 0 and 2 so the varying matches the pixel channel order
2999 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3000 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
// exact value at the first pixel of the span
3001 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3002 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3003 for (x = startx; x < endx;)
3005 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3006 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3007 if (nextsub >= endx)
// last sub-span: end on the final pixel and rescale the step to its length
3009 nextsub = endsub = endx-1;
3010 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
// exact value at the end of this sub-span
3014 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
// per-pixel fixed point step across the sub-span
3015 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3016 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
// low half = value at pixel x, high half = value at pixel x+1
3017 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
// the loop below consumes two pixels per iteration, so both halves have to
// advance by two per-pixel steps each time; adding a single step made the
// varying lag behind at half rate within the sub-span
3018 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
3019 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
// two pixels at once: drop the 4 fractional bits and pack to bytes
3021 __m128i pix = _mm_srai_epi16(submod, 4);
3022 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the above
3026 __m128i pix = _mm_srai_epi16(submod, 4);
3027 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// SSE2 span helper for 8-bit pixels: out = ina + max(inb - subcolor, 0) with
// the final result saturated to 0..255 by the pack.
// subcolor is scaled by 255, has components 0 and 2 swapped to match the pixel
// channel order, and is packed to 16-bit lanes (duplicated in both halves so it
// applies to two pixels at once).
3034 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3037 int x, startx = span->startx, endx = span->endx;
3038 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3039 localcolor = _mm_packs_epi32(localcolor, localcolor);
// two pixels per iteration
3040 for (x = startx;x+2 <= endx;x+=2)
3042 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3043 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// unsigned saturating subtract clamps (inb - subcolor) at zero
3044 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3045 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the above
3049 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3050 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3051 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3052 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 span helper for 8-bit pixels: out = ina * inb per channel.
// ina is widened into the low byte of each 16-bit lane and inb into the high
// byte (value * 256), so the high 16 bits of their product equal
// ina * inb / 256.
3057 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3060 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3061 for (x = startx;x+2 <= endx;x+=2)
3063 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3064 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065 pix1 = _mm_mulhi_epu16(pix1, pix2);
3066 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the above
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072 pix1 = _mm_mulhi_epu16(pix1, pix2);
3073 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 span helper for 8-bit pixels: out = ina + inb per channel. The sum is
// formed in 16-bit lanes and saturated to 0..255 when packed back to bytes.
3078 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3081 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3082 for (x = startx;x+2 <= endx;x+=2)
3084 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3086 pix1 = _mm_add_epi16(pix1, pix2);
3087 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the above
3091 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3093 pix1 = _mm_add_epi16(pix1, pix2);
3094 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 span helper for 8-bit pixels: out = ina + inb * tint per channel,
// saturated to 0..255 when packed back to bytes.
// The tint is scaled by 256 and loaded without a channel swap, so it is applied
// to the pixel channels in the order given (the parameter name suggests the
// caller already supplies it in BGRA order).
3099 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3102 int x, startx = span->startx, endx = span->endx;
3103 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3104 tint = _mm_packs_epi32(tint, tint);
// two pixels per iteration
3105 for (x = startx;x+2 <= endx;x+=2)
3107 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
// inb goes into the high byte of each lane so mulhi yields inb * tint
3108 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3109 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3110 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the above
3114 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3115 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3116 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3117 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 span helper for 8-bit pixels: blends ina towards inb using the 4th
// channel of inb as the blend factor, out = ina + (inb - ina) * blend / 256.
// Both the difference and the blend factor are shifted left by 4 so that the
// high 16 bits of their signed product equal (difference * blend) >> 8.
3122 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3125 int x, startx = span->startx, endx = span->endx;
// two pixels per iteration
3126 for (x = startx;x+2 <= endx;x+=2)
3128 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3129 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's 4th channel across its own four lanes
3130 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3131 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3132 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant of the above (only the low half is meaningful)
3136 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3137 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3138 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3139 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3140 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// SSE2 span helper for 8-bit pixels: blends every pixel towards a single color
// using that color's 4th component as the blend factor,
// out = in + (color - in) * blend / 256.
// The color is scaled by 255, has components 0 and 2 swapped to match the pixel
// channel order, and is packed to 16-bit lanes for two pixels at once.
3145 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3148 int x, startx = span->startx, endx = span->endx;
3149 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3150 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the 4th channel to every lane and pre-shift it by 4 for the mulhi
3151 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
// two pixels per iteration
3152 for (x = startx;x+2 <= endx;x+=2)
3154 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3155 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3156 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant of the above
3160 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3161 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3162 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage of the generic shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform, loads the color and
// TEXCOORD0 arrays, and loads TEXCOORD1 only when the SPECULAR permutation
// bit is set.
3169 void DPSOFTRAST_VertexShader_Generic(void)
3171 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3172 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3173 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3174 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3175 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage of the generic shader, working on 8-bit span buffers.
// With the DIFFUSE permutation the first texture is sampled and multiplied by
// the interpolated varying in array 1; with SPECULAR a second texture is
// sampled and combined with that result using one of three mutually exclusive
// modes (multiply, add, alpha blend). The finished span is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
3178 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3180 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3181 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3182 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3183 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3185 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture (texcoord array 2) modulated by the varying in array 1
3187 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3188 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3189 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3191 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
// COLORMAPPING: multiply the second texture into the result
3192 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3195 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// GLOW: add the second texture to the result. This test previously repeated
// SHADERPERMUTATION_COLORMAPPING, which made the additive branch unreachable.
3197 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3200 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// VERTEXTEXTUREBLEND: blend using the second texture's own alpha
3202 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3205 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
// NOTE(review): presumably the fallback when DIFFUSE is not set (varying
// color only) - confirm the enclosing branch.
3210 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3211 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the post-process shader: transforms and projects the
// position array with the ModelViewProjection matrix uniform, loads TEXCOORD0,
// and loads input array TEXCOORD4 into output array TEXCOORD1.
3216 void DPSOFTRAST_VertexShader_PostProcess(void)
3218 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3220 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the post-process shader: samples the first texture into the
// fragment buffer, optionally adds the bloom texture (BLOOM permutation,
// thresholded by the BloomColorSubtract uniform), blends towards the
// ViewTintColor uniform and finishes the span. The SATURATION and GAMMARAMPS
// permutations are tested but not implemented.
3223 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3225 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3226 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3230 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3231 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture is sampled with texcoord array 3 and added after thresholding
3233 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3234 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3236 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3237 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3239 // TODO: implement saturation
3241 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3243 // TODO: implement gammaramps
3245 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the depth/shadow shader: only transforms and projects the
// position array with the ModelViewProjection matrix uniform.
3250 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3252 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage of the depth/shadow shader: fills the span's fragment buffer
// with zero bytes and finishes the span.
3255 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3257 // this is never called (because colormask is off when this shader is used)
3258 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3259 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3260 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, covering pixels startx .. endx-1
3261 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3262 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the flat-color shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform and transforms TEXCOORD0
// with the TexMatrix uniform.
3267 void DPSOFTRAST_VertexShader_FlatColor(void)
3269 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the flat-color shader: color texture * ambient color, with the
// Alpha uniform as the alpha multiplier.
// Results are written straight into the framebuffer row unless alpha testing
// or a non-opaque blend mode is active, in which case they go to a temporary
// span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
3273 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3276 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address of the span origin (4 bytes per pixel)
3277 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3278 int x, startx = span->startx, endx = span->endx;
3279 __m128i Color_Ambientm;
3280 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3281 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span needs finishing
3285 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3286 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its 4th lane is
// cleared and replaced by the Alpha uniform scaled by 255, then packed to
// 16-bit lanes (duplicated for two pixels)
3287 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3288 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3289 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3290 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3291 for (x = startx;x < endx;x++)
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3294 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3297 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3298 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3299 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3300 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3306 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3307 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3308 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finishing pass
3310 if (pixel == buffer_FragColorbgra8)
3311 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the vertex-color shader: transforms and projects the
// position array with the ModelViewProjection matrix uniform, loads the color
// array and transforms TEXCOORD0 with the TexMatrix uniform.
3317 void DPSOFTRAST_VertexShader_VertexColor(void)
3319 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3320 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3321 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage of the vertex-color shader:
// color texture * (diffuse color * interpolated vertex color + ambient color).
// Results are written straight into the framebuffer row unless alpha testing
// or a non-opaque blend mode is active, in which case they go to a temporary
// span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
3324 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3327 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address of the span origin (4 bytes per pixel)
3328 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3329 int x, startx = span->startx, endx = span->endx;
3330 __m128i Color_Ambientm, Color_Diffusem;
3332 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3333 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3335 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3336 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3337 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// redirect output to the temporary buffer when the span needs finishing
3338 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3339 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its 4th lane is
// cleared and replaced by the Alpha uniform scaled by 255
3340 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3341 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3342 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3343 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with components 0 and 2 swapped; its 4th lane is
// cleared, so the 4th channel is driven by the ambient/alpha term alone
3344 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3345 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3346 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// interpolated vertex color: swap components 0 and 2, start it at startx and
// scale both value and slope by 4096 (4096 * 4096 / 65536 = 256, so the mulhi
// with the diffuse color lands on the same scale as the ambient term)
3347 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3348 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3349 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3350 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3351 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3352 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3353 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3355 __m128i color, mod, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3356 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
// four z values, one per pixel; data is advanced by slope between pixels
3359 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3360 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3361 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3362 data = _mm_add_ps(data, slope);
3363 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3364 data = _mm_add_ps(data, slope);
3365 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3366 data = _mm_add_ps(data, slope);
3367 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3368 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3369 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3370 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3371 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3372 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3378 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3379 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3380 mod = _mm_packs_epi32(mod, mod);
3381 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3382 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finishing pass
3384 if (pixel == buffer_FragColorbgra8)
3385 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the lightmap shader: transforms and projects the position
// array with the ModelViewProjection matrix uniform, transforms TEXCOORD0 with
// the TexMatrix uniform and loads TEXCOORD4 unchanged.
3391 void DPSOFTRAST_VertexShader_Lightmap(void)
3393 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3394 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3395 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage of the lightmap shader:
// color texture * (diffuse color * lightmap texture + ambient color), plus
// glow color * glow texture when the GLOW permutation is set.
// Results are written straight into the framebuffer row unless alpha testing
// or a non-opaque blend mode is active, in which case they go to a temporary
// span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
3398 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3401 unsigned char * RESTRICT pixelmask = span->pixelmask;
// framebuffer address of the span origin (4 bytes per pixel)
3402 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3403 int x, startx = span->startx, endx = span->endx;
3404 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3405 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3406 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3407 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3408 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3409 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3410 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap texture uses TEXCOORD4
3411 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3412 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// redirect output to the temporary buffer when the span needs finishing
3413 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3414 pixel = buffer_FragColorbgra8;
// ambient color scaled by 256 with components 0 and 2 swapped; its 4th lane is
// cleared and replaced by the Alpha uniform scaled by 255
3415 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3416 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3417 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3418 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 256 with components 0 and 2 swapped, 4th lane cleared
3419 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3420 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3421 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3422 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow variant: sample the glow texture and prepare the glow color the same way
3424 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3425 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3426 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3427 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low half = ambient, high half = glow color (used by the single-pixel path)
3428 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3429 for (x = startx;x < endx;x++)
3431 __m128i color, lightmap, glow, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3432 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3435 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3436 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3437 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3438 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3439 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3440 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3441 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3442 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3443 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3444 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color term is computed in the low half and the
// glow term in the high half of one register, then the two halves are summed
3450 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3451 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3452 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3453 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3454 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3455 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// variant without the glow term
3460 for (x = startx;x < endx;x++)
3462 __m128i color, lightmap, pix;
// fast path: at least 4 pixels remain and all 4 mask bytes equal 1
3463 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3466 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3467 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3468 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3469 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3470 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3471 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3472 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3478 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3479 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3480 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3481 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the temporary buffer needs the finishing pass
3484 if (pixel == buffer_FragColorbgra8)
3485 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders below call
// the LightDirection shaders before their definitions appear.
3490 void DPSOFTRAST_VertexShader_LightDirection(void);
3491 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// FakeLight vertex shader: does no work of its own, it only runs
// DPSOFTRAST_VertexShader_LightDirection().
3493 void DPSOFTRAST_VertexShader_FakeLight(void)
3495 DPSOFTRAST_VertexShader_LightDirection();
// FakeLight pixel shader: forwards thread, triangle and span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which picks its FakeLight behaviour
// by testing thread->shader_mode == SHADERMODE_FAKELIGHT.
3498 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3500 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (model space) vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection() and then additionally loads
// TEXCOORD4, the attribute the pixel shader uses to sample the lightmap and
// deluxemap textures.
3505 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3507 DPSOFTRAST_VertexShader_LightDirection();
3508 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (model space) pixel shader: forwards thread, triangle and
// span unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles this
// mode by testing thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3511 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3513 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// LightDirectionMap (tangent space) vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection() and then additionally loads
// TEXCOORD4, the attribute the pixel shader uses to sample the lightmap and
// deluxemap textures.
3518 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3520 DPSOFTRAST_VertexShader_LightDirection();
3521 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// LightDirectionMap (tangent space) pixel shader: forwards thread, triangle and
// span unchanged to DPSOFTRAST_PixelShader_LightDirection(), which handles this
// mode by testing thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3524 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3526 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
// Vertex shader shared by the LightDirection, FakeLight and LightDirectionMap_*
// modes (those entry points call this function).
//
// Reads the LightDir and EyePosition uniforms, loads POSITION and TEXCOORD1..3,
// transforms TEXCOORD0 by the TexMatrixM1 uniform, and then for every vertex writes:
//   TEXCOORD5 = (dot(svector, LightDir), dot(tvector, LightDir), dot(normal, LightDir), 0)
//   TEXCOORD6 = the same three dot products taken with (EyePosition - position), w = 0
// where svector / tvector / normal are the xyz components of TEXCOORD1 / 2 / 3.
// Finally POSITION is passed through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform.
//
// NOTE(review): this listing does not show every line of the function (braces and
// the declarations of i, LightDir, EyeVector, position, svector, tvector and normal
// are not visible here) - confirm against the full file before changing any code.
3531 void DPSOFTRAST_VertexShader_LightDirection(void)
3534 int numvertices = dpsoftrast.numvertices;
3536 float LightVector[4];
3537 float EyePosition[4];
3538 float EyeVectorModelSpace[4];
// Copy the two uniforms into locals; component [3] of each is read here but is
// not used by any of the lines visible below.
3544 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3545 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3546 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3547 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3548 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3549 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3550 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3551 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
// Populate the post-transform arrays the per-vertex loop reads from.
3552 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3553 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3554 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3555 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3556 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3557 for (i = 0;i < numvertices;i++)
3559 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3560 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3561 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3562 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3563 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3564 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3565 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3566 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3567 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3568 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3569 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3570 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Light direction expressed in the per-vertex (svector, tvector, normal) basis.
3571 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3572 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3573 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3574 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3575 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3576 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3577 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
// Vector from the vertex to the eye position, expressed in the same basis.
3578 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3579 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3580 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3581 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3582 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3583 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3584 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3585 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3586 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3587 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
// NOTE(review): the meaning of the -1 argument is defined by
// DPSOFTRAST_Array_TransformProject, which is not visible here.
3589 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Small scalar / 3-component vector helper macros used by the pixel shaders.
// Min and Max expand one of their arguments twice, so arguments with side
// effects (e.g. i++) must not be passed to them.
3592 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3593 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// Dot product of the first three elements of a and b; each argument is
// expanded three times.
3594 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3595 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
// Length via sqrt() applied to the squared length.
3596 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Multi-line macro that starts by computing the length of v with sqrt().
// NOTE(review): most of its continuation lines are not visible in this listing;
// presumably it scales v to unit length - confirm against the full file.
3597 #define DPSOFTRAST_Vector3Normalize(v)\
3600 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Pixel shader shared by the LightDirection, FakeLight and LightDirectionMap_*
// modes (their pixel shader entry points call this function).
//
// thread   - supplies the uniforms (uniform4f), shader_permutation, shader_mode
//            and shader_exactspecularmath that select the code path.
// triangle - passed to the span/texture/attribute helpers.
// span     - supplies startx/endx, the pixel range processed by every loop here.
//
// Three lighting paths are selected by thread->shader_permutation:
//   1. SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular
//   2. SHADERPERMUTATION_DIFFUSE:  ambient + diffuse
//   3. otherwise:                  ambient only
// Each path fills buffer_FragColorbgra8, which is handed to
// DPSOFTRAST_Draw_Span_FinishBGRA8 at the end.
//
// NOTE(review): this listing does not show every line of the function. Braces,
// the `else` lines, and the declarations/assignments of z, d, glosstex,
// eyenormal, diffuse, specular and f are not visible here - confirm against the
// full file before changing any code.
3611 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3613 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3614 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3615 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3616 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3617 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3618 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3619 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3620 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3621 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3622 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3623 int x, startx = span->startx, endx = span->endx;
3624 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3625 float LightVectordata[4];
3626 float LightVectorslope[4];
3627 float EyeVectordata[4];
3628 float EyeVectorslope[4];
3629 float VectorSdata[4];
3630 float VectorSslope[4];
3631 float VectorTdata[4];
3632 float VectorTslope[4];
3633 float VectorRdata[4];
3634 float VectorRslope[4];
3636 float diffusetex[4];
3638 float surfacenormal[4];
3639 float lightnormal[4];
3640 float lightnormal_modelspace[4];
3642 float specularnormal[4];
3645 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped (uniform +0 -> [2],
// +2 -> [0]); the per-pixel code below multiplies element k of these arrays with
// byte k of the *bgra8 buffers. Color_Ambient[3] carries the Alpha uniform and is
// the only factor applied to the output alpha.
3647 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3648 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3649 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3650 Color_Glow[3] = 0.0f;
3651 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3652 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3653 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3654 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3655 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3656 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3657 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3658 Color_Pants[3] = 0.0f;
3659 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3660 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3661 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3662 Color_Shirt[3] = 0.0f;
// Start the span and fetch the base colour texture using TEXCOORD0; the
// pants/shirt and glow textures are fetched only when their permutation bit is set.
3663 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3664 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3665 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3667 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3668 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3670 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3672 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: ambient + diffuse + specular ----
3674 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3676 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3677 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3678 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3679 Color_Diffuse[3] = 0.0f;
3680 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3681 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3682 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3683 LightColor[3] = 0.0f;
3684 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3685 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3686 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3687 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3688 Color_Specular[3] = 0.0f;
// The power uniform is pre-divided by 255 because it is later multiplied by the
// raw gloss alpha byte (glosstex[3]) to form the pow() exponent.
3689 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3690 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3691 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// Mode-specific inputs: S/T/R attributes plus lightmap and deluxemap (model space),
// lightmap and deluxemap only (tangent space), nothing extra (fake light), or the
// interpolated light vector from TEXCOORD5 for the remaining case.
3693 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3695 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3696 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3697 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3698 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3699 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3701 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3703 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3704 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3706 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3708 // nothing of this needed
3712 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the specular path.
// NOTE(review): z is used below but its assignment is not visible in this
// listing (presumably taken from buffer_z[x]) - confirm.
3715 for (x = startx;x < endx;x++)
3718 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3719 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3720 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3721 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3722 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3724 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3725 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3726 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3727 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3729 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3730 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3731 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3732 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// Decode the normal map texel: byte * (1/128) - 1, taking x/y/z from bytes 2/1/0.
3733 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3734 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3735 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3736 DPSOFTRAST_Vector3Normalize(surfacenormal);
3738 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3740 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3741 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3742 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3743 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3745 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3746 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3747 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3748 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3750 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3751 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3752 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3753 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3755 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3756 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3757 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3758 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3760 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3761 DPSOFTRAST_Vector3Normalize(lightnormal);
3763 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// Lightmap bytes are scaled by 1/256 and divided by lightnormal[2], with the
// divisor floored at 0.25 so the factor never exceeds 1/64.
3765 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3766 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3767 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3768 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3771 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3773 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3774 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3775 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3777 float f = 1.0f / 256.0f;
3778 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3779 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3780 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
// Fake light: the normalized eye vector doubles as the light direction and the
// light colour is forced to 1.
3783 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3785 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3786 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3787 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3788 DPSOFTRAST_Vector3Normalize(lightnormal);
3790 LightColor[0] = 1.0;
3791 LightColor[1] = 1.0;
3792 LightColor[2] = 1.0;
// Remaining shader mode (the `else` line is not shown in this listing): light
// direction comes from the interpolated TEXCOORD5 attribute; LightColor keeps
// the uniform value loaded above.
3796 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3797 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3798 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3799 DPSOFTRAST_Vector3Normalize(lightnormal);
3802 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3804 if(thread->shader_exactspecularmath)
3806 // reflect lightnormal at surfacenormal, take the negative of that
3807 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3809 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3810 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3811 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3812 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3814 // dot of this and normalize(EyeVectorFogDepth.xyz)
3815 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3816 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3817 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3818 DPSOFTRAST_Vector3Normalize(eyenormal);
3820 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Non-exact variant (the `else` line is not shown in this listing): uses the
// normalized sum of the light and eye directions and dots it with the surface normal.
3824 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3825 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3826 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3827 DPSOFTRAST_Vector3Normalize(eyenormal);
3829 specularnormal[0] = lightnormal[0] + eyenormal[0];
3830 specularnormal[1] = lightnormal[1] + eyenormal[1];
3831 specularnormal[2] = lightnormal[2] + eyenormal[2];
3832 DPSOFTRAST_Vector3Normalize(specularnormal);
3834 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// Exponent = SpecularPower uniform * gloss alpha byte / 255 (see SpecularPower above).
3837 specular = pow(specular, SpecularPower * glosstex[3]);
// Combine the terms per channel. NOTE(review): only the upper bound (255) is
// clamped before the value is stored into the unsigned char output buffer.
3838 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3840 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3841 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3842 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3843 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3847 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3848 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3849 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3850 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3853 buffer_FragColorbgra8[x*4+0] = d[0];
3854 buffer_FragColorbgra8[x*4+1] = d[1];
3855 buffer_FragColorbgra8[x*4+2] = d[2];
3856 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: ambient + diffuse (no gloss texture, no specular term, and no
// pants/shirt contribution in the visible lines) ----
3859 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3861 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3862 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3863 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3864 Color_Diffuse[3] = 0.0f;
3865 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3866 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3867 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3868 LightColor[3] = 0.0f;
3869 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3871 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3873 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3874 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3875 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3876 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3877 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3879 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3881 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3882 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// Unlike path 1, the eye vector attribute is only needed here for fake light,
// where it stands in for the light direction.
3884 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3886 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3890 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// Per-pixel loop of the diffuse path; the light direction/colour selection
// mirrors the one in path 1.
3893 for (x = startx;x < endx;x++)
3896 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3897 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3898 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3899 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3900 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3901 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3902 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3903 DPSOFTRAST_Vector3Normalize(surfacenormal);
3905 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3907 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3908 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3909 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3910 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3912 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3913 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3914 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3915 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3917 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3918 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3919 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3920 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3922 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3923 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3924 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3925 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3927 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3928 DPSOFTRAST_Vector3Normalize(lightnormal);
3930 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3932 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3933 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3934 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3935 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3938 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3940 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3941 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3942 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3944 float f = 1.0f / 256.0f;
3945 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3946 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3947 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3950 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3952 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3953 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3954 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3955 DPSOFTRAST_Vector3Normalize(lightnormal);
3957 LightColor[0] = 1.0;
3958 LightColor[1] = 1.0;
3959 LightColor[2] = 1.0;
3963 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3964 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3965 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3966 DPSOFTRAST_Vector3Normalize(lightnormal);
3969 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3970 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3972 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3973 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3974 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3975 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3979 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3980 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3981 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3982 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3984 buffer_FragColorbgra8[x*4+0] = d[0];
3985 buffer_FragColorbgra8[x*4+1] = d[1];
3986 buffer_FragColorbgra8[x*4+2] = d[2];
3987 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (the `else` line is not shown in this listing):
// colour texture times Color_Ambient, plus glow when that permutation is set ----
3992 for (x = startx;x < endx;x++)
3995 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3996 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3997 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3998 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4000 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4002 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4003 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4004 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4005 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4009 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4010 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4011 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4012 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4014 buffer_FragColorbgra8[x*4+0] = d[0];
4015 buffer_FragColorbgra8[x*4+1] = d[1];
4016 buffer_FragColorbgra8[x*4+2] = d[2];
4017 buffer_FragColorbgra8[x*4+3] = d[3];
// Hand the finished span colours to the common span finishing routine.
4020 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex shader for the LightSource mode.
//
// Reads the LightPosition and EyePosition uniforms, loads POSITION and
// TEXCOORD1..4, transforms TEXCOORD0 by the TexMatrixM1 uniform, and then for
// every vertex replaces:
//   TEXCOORD1 with (LightPosition - position) dotted with svector / tvector / normal, w = 0
//   TEXCOORD2 with (EyePosition   - position) dotted with svector / tvector / normal, w = 0
// where svector / tvector / normal are the xyz of TEXCOORD1 / 2 / 3 as loaded.
// Afterwards POSITION goes through DPSOFTRAST_Array_TransformProject with the
// ModelViewProjectionMatrixM1 uniform, and DPSOFTRAST_Array_Transform is called
// with TEXCOORD3, POSITION and the ModelToLightM1 uniform.
//
// NOTE(review): this listing does not show every line of the function (braces and
// the declarations of i, EyeVector, position, svector, tvector and normal are not
// visible here) - confirm against the full file before changing any code.
4025 void DPSOFTRAST_VertexShader_LightSource(void)
4028 int numvertices = dpsoftrast.numvertices;
4029 float LightPosition[4];
4030 float LightVector[4];
4031 float LightVectorModelSpace[4];
4032 float EyePosition[4];
4033 float EyeVectorModelSpace[4];
// Copy the two uniforms into locals; component [3] of each is read here but is
// not used by any of the lines visible below.
4039 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4040 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4041 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4042 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4043 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4044 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4045 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4046 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4047 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4048 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4049 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4050 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4051 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4052 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4053 for (i = 0;i < numvertices;i++)
// The basis vectors are copied into locals first because the TEXCOORD1 and
// TEXCOORD2 slots for this vertex are overwritten further down in the loop body.
4055 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4056 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4057 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4058 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4059 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4060 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4061 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4062 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4063 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4064 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4065 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4066 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
// Vector from the vertex to the light, expressed in the (svector, tvector, normal) basis.
4067 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4068 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4069 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4070 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4071 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4072 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4073 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4074 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4075 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4076 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
// Vector from the vertex to the eye position, expressed in the same basis.
4077 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4078 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4079 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4080 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4081 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4082 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4083 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4084 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4085 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4086 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
// NOTE(review): the meaning of the -1 argument and the (output, input) order of
// the array arguments are defined by the helpers, which are not visible here.
// Presumably the last call fills TEXCOORD3 from POSITION using the ModelToLightM1
// matrix, replacing the normals loaded earlier - confirm against the helpers.
4088 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4089 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span (pixel) shader for the light-source shader mode.
// Reads its uniforms from thread->uniform4f, samples the textures that the active
// shader permutation asks for, writes one BGRA8 pixel per x in [span->startx, span->endx)
// into buffer_FragColorbgra8 and hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Three per-pixel loops are visible: one for SHADERPERMUTATION_SPECULAR, one for
// SHADERPERMUTATION_DIFFUSE, and an ambient-only fallback.
// NOTE(review): this listing appears to have lines dropped (no braces / `else` / `continue;`,
// and z, glosstex, eyenormal, diffuse, specular, attenuation, f and d are used without a
// visible declaration or, for z, a visible assignment) - confirm against the complete file.
4092 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
// per-span scratch buffers, all indexed by absolute x (4 bytes per pixel for the bgra8 ones)
4095 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4096 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4097 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4098 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4099 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4100 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4101 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4102 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4103 int x, startx = span->startx, endx = span->endx;
4104 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4105 float CubeVectordata[4];
4106 float CubeVectorslope[4];
4107 float LightVectordata[4];
4108 float LightVectorslope[4];
4109 float EyeVectordata[4];
4110 float EyeVectorslope[4];
4112 float diffusetex[4];
4114 float surfacenormal[4];
4115 float lightnormal[4];
4117 float specularnormal[4];
4120 float SpecularPower;
4121 float CubeVector[4];
// Colour uniforms are copied with components 0 and 2 swapped (uniform +0 -> [2], +2 -> [0]);
// presumably RGB -> BGR so they line up with the *bgra8 buffers - TODO confirm.
// The 4th component is 0 for every colour except Color_Ambient, which takes the Alpha uniform.
// Color_Glow is loaded here but not read anywhere in the visible part of this function.
4124 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4125 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4126 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4127 Color_Glow[3] = 0.0f;
4128 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4129 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4130 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4131 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4132 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4133 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4134 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4135 Color_Diffuse[3] = 0.0f;
4136 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4137 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4138 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4139 Color_Specular[3] = 0.0f;
4140 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4141 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4142 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4143 Color_Pants[3] = 0.0f;
4144 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4145 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4146 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4147 Color_Shirt[3] = 0.0f;
4148 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4149 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4150 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4151 LightColor[3] = 0.0f;
// pre-scaled by 1/255 because it is later multiplied by the gloss alpha byte (glosstex[3]) to form the pow() exponent
4152 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// data/slope pairs for the interpolated light (TEXCOORD1), eye (TEXCOORD2) and cube (TEXCOORD3) vectors;
// the loops below evaluate them per pixel as (data + slope*x) * z
4153 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4154 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4155 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4156 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4157 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
// the base colour texture is always sampled; pants/shirt only with COLORMAPPING, the cube map only with CUBEFILTER
4158 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4159 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4161 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4162 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4164 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4165 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- SPECULAR permutation: ambient + diffuse + specular, needs the normal and gloss textures ----
4166 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4168 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4169 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4170 for (x = startx;x < endx;x++)
4173 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4174 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4175 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// falloff term: 1 - |CubeVector|^2. The statement controlled by each "< 0.01f" test is not
// visible in this listing (presumably it skips the pixel, leaving the memset black) - confirm.
4176 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4177 if (attenuation < 0.01f)
4179 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4181 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4182 if (attenuation < 0.01f)
// texel values stay in byte scale (0..255) as floats
4186 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4187 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4188 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4189 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4190 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4192 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4193 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4194 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4195 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4197 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4198 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4199 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4200 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte * (1/128) - 1, with bytes 2,1,0 feeding components 0,1,2
4201 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4202 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4203 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4204 DPSOFTRAST_Vector3Normalize(surfacenormal);
4206 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4207 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4208 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4209 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term: N.L clamped at 0
4211 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// Two ways of computing the specular term follow: a reflection-vector form when
// shader_exactspecularmath is set, then a half-vector form (normalize(L + E) dotted with N).
// NOTE(review): the `else` separating them is not visible in this listing - confirm.
4213 if(thread->shader_exactspecularmath)
4215 // reflect lightnormal at surfacenormal, take the negative of that
4216 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4218 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4219 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4220 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4221 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4223 // dot of this and normalize(EyeVectorFogDepth.xyz)
4224 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4225 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4226 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4227 DPSOFTRAST_Vector3Normalize(eyenormal);
4229 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4233 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4234 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4235 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4236 DPSOFTRAST_Vector3Normalize(eyenormal);
4238 specularnormal[0] = lightnormal[0] + eyenormal[0];
4239 specularnormal[1] = lightnormal[1] + eyenormal[1];
4240 specularnormal[2] = lightnormal[2] + eyenormal[2];
4241 DPSOFTRAST_Vector3Normalize(specularnormal);
4243 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower (already divided by 255) * gloss alpha byte
4245 specular = pow(specular, SpecularPower * glosstex[3]);
// Final colour per channel; only the upper bound (255) is clamped, and alpha is the diffuse texel alpha.
// The second group of d[] assignments is the variant without the cube filter factor.
4247 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4249 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4250 attenuation *= (1.0f / 255.0f);
4251 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4252 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4253 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4254 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4258 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4259 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4260 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4261 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4263 buffer_FragColorbgra8[x*4+0] = d[0];
4264 buffer_FragColorbgra8[x*4+1] = d[1];
4265 buffer_FragColorbgra8[x*4+2] = d[2];
4266 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- DIFFUSE permutation: same as above without the gloss texture and specular term ----
4269 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4271 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4272 for (x = startx;x < endx;x++)
4275 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4276 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4277 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4278 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4279 if (attenuation < 0.01f)
4281 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4283 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4284 if (attenuation < 0.01f)
4288 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4289 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4290 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4291 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4292 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4294 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4295 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4296 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4297 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4299 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4300 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4301 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4302 DPSOFTRAST_Vector3Normalize(surfacenormal);
4304 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4305 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4306 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4307 DPSOFTRAST_Vector3Normalize(lightnormal);
4309 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4310 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4312 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4313 attenuation *= (1.0f / 255.0f);
4314 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4315 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4316 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4317 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4321 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4322 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4323 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4324 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4326 buffer_FragColorbgra8[x*4+0] = d[0];
4327 buffer_FragColorbgra8[x*4+1] = d[1];
4328 buffer_FragColorbgra8[x*4+2] = d[2];
4329 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- fallback loop: ambient term only, no normal map lookup ----
// NOTE(review): the `else` introducing this loop is not visible in this listing - confirm.
4334 for (x = startx;x < endx;x++)
4337 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4338 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4339 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4340 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4341 if (attenuation < 0.01f)
4343 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4345 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4346 if (attenuation < 0.01f)
4350 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4351 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4352 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4353 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4354 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4356 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4357 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4358 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4359 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4361 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4363 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4364 attenuation *= (1.0f / 255.0f);
4365 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4366 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4367 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4368 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4372 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4373 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4374 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4375 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4377 buffer_FragColorbgra8[x*4+0] = d[0];
4378 buffer_FragColorbgra8[x*4+1] = d[1];
4379 buffer_FragColorbgra8[x*4+2] = d[2];
4380 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the finished span colours over for writing
4383 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the refraction shader mode: three array transforms driven by uniform matrices.
// NOTE(review): the helpers' argument order is presumably (output array, input array, matrix) - confirm.
4389 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD1 gets POSITION run through the ModelViewProjection matrix via the plain Transform helper;
// DPSOFTRAST_PixelShader_Refraction reads TEXCOORD1 as ModelViewProjectionPosition and divides by its 4th component itself
4391 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is transformed in place by the texture matrix
4392 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// POSITION is transformed in place last, after it has been used as the input for TEXCOORD1 above
4393 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span (pixel) shader for the refraction shader mode.
// For every x in [span->startx, span->endx) it derives a screen-space texture coordinate from the
// interpolated ModelViewProjectionPosition, offsets it by the normal-map xy scaled by
// DistortScaleRefractReflect, fetches a texel from mip level 0 of the GL20TU_REFRACTION texture
// (bilinear when the texture has DPSOFTRAST_TEXTURE_FILTER_LINEAR, single texel otherwise),
// multiplies it by RefractColor and writes the result to buffer_FragColorbgra8.
// Returns without drawing when no refraction texture is bound.
// NOTE(review): this listing appears to have lines dropped (no braces / `else`; iw, v and c are used
// without a visible declaration, and the single-texel path's assignment to c is not visible) - confirm
// against the complete file.
4396 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4398 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4400 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4402 int x, startx = span->startx, endx = span->endx;
4405 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4406 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4409 float ModelViewProjectionPositiondata[4];
4410 float ModelViewProjectionPositionslope[4];
4413 float ScreenScaleRefractReflect[2];
4414 float ScreenCenterRefractReflect[2];
4415 float DistortScaleRefractReflect[2];
4416 float RefractColor[4];
4418 const unsigned char * RESTRICT pixelbase;
4419 const unsigned char * RESTRICT pixel[4];
4420 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4421 if(!texture) return;
// start of the texel data used below: texture->bytes plus the value stored in mipmap[0][0]
4422 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4425 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4426 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4429 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
// only the first two components of each *RefractReflect uniform are used here
4432 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4433 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4434 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4435 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4436 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4437 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// components 0 and 2 are swapped on load (presumably RGB -> BGR to match the bgra8 buffers - TODO confirm)
4438 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4439 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4440 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4441 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4444 for (x = startx;x < endx;x++)
4446 float SafeScreenTexCoord[2];
4447 float ScreenTexCoord[2];
4454 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4455 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4457 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4458 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4459 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4461 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal map texel (byte * (1/128) - 1, bytes 2,1,0 -> components 0,1,2); only v[0] and v[1] displace the coordinate
4462 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4463 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4464 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4465 DPSOFTRAST_Vector3Normalize(v);
4466 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4467 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4469 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
// Bilinear path: tc is a 20.12 fixed-point texel coordinate shifted by half a texel (2048);
// frac/ifrac are the 12-bit blend weights, so each lerp[] product carries 24 fractional bits,
// which the >>24 below removes again.
4470 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// NOTE(review): the initializer is a float expression that becomes negative when ScreenTexCoord is
// near or below 0; converting a negative float to unsigned int is undefined in C (C11 6.3.1.4),
// and because tc is unsigned the `>= 0` clamps on tci below may not catch that case - consider a
// signed type. Confirm the intended range of ScreenTexCoord.
4472 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4473 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4474 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4475 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4476 int tci[2] = { tc[0]>>12, tc[1]>>12 };
4477 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel corners to [0, mipmap[0][2]-1] horizontally and [0, mipmap[0][3]-1] vertically
4478 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4479 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4480 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4481 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
// four neighbouring texels, 4 bytes each, rows mipmap[0][2] texels wide
4482 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4483 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4484 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4485 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4486 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4487 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4488 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// Single-texel path: truncate the scaled coordinate to an int and clamp it to the texture.
// NOTE(review): the `else` and the assignments to c for this path are not visible in this listing - confirm.
4492 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4493 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4494 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4495 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4501 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// colour channels are scaled by RefractColor with no clamp before the store to unsigned char;
// alpha is RefractColor[3]*256 limited to 255
4502 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4503 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4504 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4505 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4508 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the water shader mode: a single DPSOFTRAST_Array_TransformProject call with
// POSITION as both array arguments and the ModelViewProjection matrix uniform.
4513 void DPSOFTRAST_VertexShader_Water(void)
4515 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the water shader mode. No shading is computed here: it begins the span,
// zero-fills the span's [startx, endx) range of the colour buffer and finishes the span.
4519 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4522 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4523 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4524 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all set to 0
4525 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4526 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the show-depth shader mode: a single DPSOFTRAST_Array_TransformProject call with
// POSITION as both array arguments and the ModelViewProjection matrix uniform.
4531 void DPSOFTRAST_VertexShader_ShowDepth(void)
4533 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the show-depth shader mode. No shading is computed here: it begins the span,
// zero-fills the span's [startx, endx) range of the colour buffer and finishes the span.
4536 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4539 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4540 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4541 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all set to 0
4542 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4543 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-geometry shader mode: a single DPSOFTRAST_Array_TransformProject call
// with POSITION as both array arguments and the ModelViewProjection matrix uniform.
4548 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4550 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the deferred-geometry shader mode. No shading is computed here: it begins the span,
// zero-fills the span's [startx, endx) range of the colour buffer and finishes the span.
4553 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4556 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4557 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4558 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all set to 0
4559 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4560 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for the deferred-light-source shader mode: a single DPSOFTRAST_Array_TransformProject
// call with POSITION as both array arguments and the ModelViewProjection matrix uniform.
4565 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4567 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span shader for the deferred-light-source shader mode. No shading is computed here: it begins the
// span, zero-fills the span's [startx, endx) range of the colour buffer and finishes the span.
4570 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4573 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4574 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4575 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// 4 bytes per pixel, all set to 0
4576 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4577 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Per-shader-mode dispatch record; DPSOFTRAST_ShaderModeTable holds one of these per shader mode.
// NOTE(review): every table row starts with an integer (2) ahead of the two function pointers, so a
// leading integer member (and the braces) appear to be missing from this listing - confirm against
// the complete file.
4582 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex stage entry point; takes no arguments
4585 void (*Vertex)(void);
// span (pixel) stage entry point; DPSOFTRAST_Draw_ProcessSpans calls it as .Span(thread, triangle, span)
4586 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices for this mode; the table initializers end each list with ~0
4587 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of GL20TU_* texture unit indices for this mode; ended with ~0 in the rows that provide one
4588 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4590 DPSOFTRAST_ShaderModeInfo;
// Dispatch table with SHADERMODE_COUNT entries, indexed by thread->shader_mode
// (DPSOFTRAST_Draw_ProcessSpans calls DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span).
// Row layout: { integer (2 in every row), vertex shader, span shader, {array list}, {texture unit list} }.
// Both lists end with ~0, which becomes the all-ones byte value when stored in an unsigned char.
// NOTE(review): row order presumably follows the SHADERMODE_* enumeration - confirm against its definition.
4592 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4594 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4595 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4596 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4597 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4598 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4599 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4600 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4601 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4602 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4603 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4604 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4605 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
// NOTE(review): the four rows below give no texunits initializer, so that member is zero-filled
// instead of starting with the ~0 terminator used by the other rows - confirm that whatever walks
// texunits copes with this (or add an explicit {~0}).
4606 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4607 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4608 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4609 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Walks every span queued in thread->spans, applies the depth test when it is
// enabled, hands surviving spans to the Span callback of the thread's current
// shader mode, and finally resets thread->numspans to 0.
4612 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4619 // unsigned int *colorpixel;
4620 unsigned int *depthpixel;
4626 DPSOFTRAST_State_Triangle *triangle;
4627 DPSOFTRAST_State_Span *span;
// per-pixel pass/fail flags for the span currently being processed
4628 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4629 for (i = 0; i < thread->numspans; i++)
4631 span = &thread->spans[i];
4632 triangle = &thread->triangles[span->triangle];
// depth-tested path: requires both the depthtest state and a depth buffer
4633 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane at the span origin:
// w[0] is the slope along x, w[1] the slope along y, w[2] the constant term
4635 wslope = triangle->w[0];
4636 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// integer depth step per pixel, and the starting depth biased by polygonoffset
4637 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4638 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel is based at the span origin, so it is indexed with span-relative x below
4639 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4640 startx = span->startx;
// fill pixelmask[startx..endx) with the result of the selected depth comparison
4642 switch(thread->fb_depthfunc)
4645 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4646 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4647 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4648 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4649 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4650 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4651 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4653 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4654 //for (x = startx;x < endx;x++)
4655 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4656 // if there is no color buffer, skip pixel shader
// shrink the span from both ends past pixels that failed the depth test
4657 while (startx < endx && !pixelmask[startx])
4659 while (endx > startx && !pixelmask[endx-1])
4662 continue; // no pixels to fill
// publish the mask and the trimmed start to the span before shading
4663 span->pixelmask = pixelmask;
4664 span->startx = startx;
4666 // run pixel shader if appropriate
4667 // do this before running depthmask code, to allow the pixelshader
4668 // to clear pixelmask values for alpha testing
4669 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4670 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depthmask pass: walks the trimmed range again with the same depth stepping
4671 if (thread->depthmask)
4672 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4678 // no depth testing means we're just dealing with color...
4679 // if there is no color buffer, skip pixel shader
4680 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// every pixel in [span->startx, span->endx) is marked as passing
4682 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4683 span->pixelmask = pixelmask;
4684 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// all queued spans have been consumed
4688 thread->numspans = 0;
// Draw command payload: row range covered by the draw, a reference count shared
// by the threads, the clipped flag, the vertex/triangle counts and pointers to
// the vertex data and the 32-bit or 16-bit element list.
4691 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one Draw command on behalf of `thread`: for each triangle it builds
// screen-space points, computes the interpolation planes stored in
// thread->triangles, emits spans into thread->spans and flushes them through
// DPSOFTRAST_Draw_ProcessSpans. The thread's reference on the command is
// dropped with ATOMIC_DECREMENT when it is done with the command's data.
4693 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4696 int cullface = thread->cullface;
4697 int minx, maxx, miny, maxy;
4698 int miny1, maxy1, miny2, maxy2;
4699 __m128i fbmin, fbmax;
4700 __m128 viewportcenter, viewportscale;
4701 int firstvertex = command->firstvertex;
4702 int numvertices = command->numvertices;
4703 int numtriangles = command->numtriangles;
4704 const int *element3i = command->element3i;
4705 const unsigned short *element3s = command->element3s;
4706 int clipped = command->clipped;
4713 int starty, endy, bandy;
4717 __m128 triangleedge1, triangleedge2, trianglenormal;
4720 DPSOFTRAST_State_Triangle *triangle;
4721 DPSOFTRAST_Texture *texture;
4722 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp this thread's two row bands (miny1..maxy1 and miny2..maxy2) to the
// vertical extent of the scissor rectangle
4723 miny = thread->fb_scissor[1];
4724 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4725 miny1 = bound(miny, thread->miny1, maxy);
4726 maxy1 = bound(miny, thread->maxy1, maxy);
4727 miny2 = bound(miny, thread->miny2, maxy);
4728 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's row range misses both bands: only release this thread's reference
4729 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4731 if (!ATOMIC_DECREMENT(command->refcount))
// a command no larger than the bare struct keeps its data in a separate
// allocation (see DPSOFTRAST_Draw_AllocateDrawCommand), which is freed here
4733 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4734 MM_FREE(command->arrays);
4738 minx = thread->fb_scissor[0];
4739 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// clip limits replicated as 16-bit (x, y) pairs; fbmax is made inclusive by
// subtracting 1 and spans from the first band's top to the second band's bottom
4740 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4741 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4742 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4743 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4744 screen[3] = _mm_setzero_ps();
4745 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4746 for (i = 0;i < numtriangles;i++)
// command->arrays begins with numvertices screen-space float4 entries; the
// block that follows them holds the post-transform positions
4748 const float *screencoord4f = command->arrays;
4749 const float *arrays = screencoord4f + numvertices*4;
4751 // generate the 3 edges of this triangle
4752 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the three vertex indices (16-bit list first, 32-bit list second) and
// rebase them to firstvertex
4755 e[0] = element3s[i*3+0] - firstvertex;
4756 e[1] = element3s[i*3+1] - firstvertex;
4757 e[2] = element3s[i*3+2] - firstvertex;
4761 e[0] = element3i[i*3+0] - firstvertex;
4762 e[1] = element3i[i*3+1] - firstvertex;
4763 e[2] = element3i[i*3+2] - firstvertex;
// SKIPBACKFACE: builds the two screen-space edges leaving point 1, computes one
// component of their cross product and tests its sign.
// NOTE(review): the sign tests are presumably selected by cullface - confirm.
4772 #define SKIPBACKFACE \
4773 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4774 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4775 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4776 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4777 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4781 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4785 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4790 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4791 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4793 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4794 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4796 #define CLIPPEDVERTEXCOPY(k,p1) \
4797 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
// GENATTRIBCOPY loads a vertex attribute unchanged; GENATTRIBLERP interpolates
// it along edge p1->p2 with the clipfrac[p1] value set by CLIPPEDVERTEXLERP
4799 #define GENATTRIBCOPY(attrib, p1) \
4800 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4801 #define GENATTRIBLERP(attrib, p1, p2) \
4803 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4804 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4806 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4810 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4811 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4812 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4813 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4814 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4815 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4816 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4822 // calculate distance from nearplane
4823 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4824 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4825 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// one case per combination of clipdist signs: points with clipdist >= 0 are
// copied, edges that change sign contribute an interpolated point
4826 if (clipdist[0] >= 0.0f)
4828 if (clipdist[1] >= 0.0f)
4830 if (clipdist[2] >= 0.0f)
4833 // triangle is entirely in front of nearplane
4834 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4841 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4849 if (clipdist[2] >= 0.0f)
4851 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4858 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4865 else if (clipdist[1] >= 0.0f)
4867 if (clipdist[2] >= 0.0f)
4869 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4876 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4882 else if (clipdist[2] >= 0.0f)
4884 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4889 else continue; // triangle is entirely behind nearplane
4892 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as 16-bit pairs (the
// third point is repeated when there are only three); the min/max reductions
// below produce the bounding box, which is then clamped to fbmin/fbmax
4893 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4894 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4895 screenmin = _mm_min_epi16(screeni, screenir),
4896 screenmax = _mm_max_epi16(screeni, screenir);
4897 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4898 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4899 screenmin = _mm_max_epi16(screenmin, fbmin);
4900 screenmax = _mm_min_epi16(screenmax, fbmax);
4901 // skip offscreen triangles
4902 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// rows covered by the clamped bounding box; endy is exclusive
4904 starty = _mm_extract_epi16(screenmin, 1);
4905 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle lies entirely in the gap between this thread's two bands
4906 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift keeps the upper 16 bits (the y half) of each packed pair
4908 screeny = _mm_srai_epi32(screeni, 16);
4911 triangle = &thread->triangles[thread->numtriangles];
4913 // calculate attribute plans for triangle data...
4914 // okay, this triangle is going to produce spans, we'd better project
4915 // the interpolants now (this is what gives perspective texturing),
4916 // this consists of simply multiplying all arrays by the W coord
4917 // (which is basically 1/Z), which will be undone per-pixel
4918 // (multiplying by Z again) to get the perspective-correct array
4921 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4922 __m128 mipedgescale, mipdensity;
// both edges divided by the scalar held in trianglenormal, then each of the
// four resulting components is broadcast into its own register
4923 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4924 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4925 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4926 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4927 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// broadcast the w component of the first three screen points
4928 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4929 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4930 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane for w itself: differences relative to point 1, x/y slopes, then the
// value at the screen origin
4931 attribedge1 = _mm_sub_ss(w0, w1);
4932 attribedge2 = _mm_sub_ss(w2, w1);
4933 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4934 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4935 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4936 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4937 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// triangle->w is stored as { x slope, y slope, origin }
4938 _mm_store_ss(&triangle->w[0], attribxslope);
4939 _mm_store_ss(&triangle->w[1], attribyslope);
4940 _mm_store_ss(&triangle->w[2], attriborigin);
4941 mipedgescale = _mm_setzero_ps();
// walk the arrays listed for the current shader mode; an entry that is not
// below DPSOFTRAST_ARRAY_TOTAL (the table uses ~0) ends the list
4942 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4944 __m128 attrib0, attrib1, attrib2;
4945 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4946 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-vertex block inside the command data
4948 arrays += numvertices*4;
4949 GENATTRIBS(attrib0, attrib1, attrib2);
// same plane construction as for w, applied to the attribute multiplied by w
4950 attriborigin = _mm_mul_ps(attrib1, w1);
4951 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4952 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4953 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4954 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4955 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4956 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4957 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4958 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// for the array that drives mip selection: attribute change along each edge
// scaled by the reciprocal screen-space length of that edge
4959 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4961 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4962 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4963 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4964 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// mip levels default to 0 and are computed only for bound textures whose
// filter value is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4968 memset(triangle->mip, 0, sizeof(triangle->mip));
4969 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4971 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4972 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4974 texture = thread->texbound[texunit];
4975 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scale by two integers read from texture->mipmap[0][2..3], square, sum per
// edge and keep the smaller of the two edge results
4977 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4978 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4979 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4980 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4981 // this will be multiplied in the texturing routine by the texture resolution
4982 y = _mm_cvtss_si32(mipdensity);
// half of log2(y), clamped to the last available mip level
4985 y = (int)(log((float)y)*0.5f/M_LN2);
4986 if (y > texture->mipmaps - 1)
4987 y = texture->mipmaps - 1;
4988 triangle->mip[texunit] = y;
// scan rows from starty to endy; the first pass is limited by the first band's
// end (maxy1), later passes restart at miny2 and are limited by maxy2
4994 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4997 __m128 xcoords, xslope;
// one 32-bit compare per point (y > point y); movemask turns each lane into a
// 4-bit nibble of the 16-bit yccmask
4998 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4999 int yccmask = _mm_movemask_epi8(ycc);
// edge0p/edge0n and edge1p/edge1n index the end points of the two edges whose
// x positions are evaluated for the rows that follow
5000 int edge0p, edge0n, edge1p, edge1n;
// table using point indices 0..3 (NOTE(review): presumably the four-point case)
5007 case 0xFFFF: /*0000*/ y = endy; continue;
5008 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5009 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5010 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5011 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5012 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5013 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5014 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5015 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5016 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5017 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5018 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5019 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5020 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5021 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5022 case 0x0000: /*1111*/ y++; continue;
// table using point indices 0..2 only (NOTE(review): presumably the three-point case)
5030 case 0xFFFF: /*000*/ y = endy; continue;
5031 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5032 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5033 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5034 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5035 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5036 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5037 case 0x0000: /*111*/ y++; continue;
// reduce to a single row value nexty that ends this run of scanlines, then
// keep it inside the current band
5040 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5041 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5042 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5043 nexty = _mm_extract_epi16(ycc, 0);
5044 if (nexty >= bandy) nexty = bandy-1;
// per-edge delta divided by its own y delta gives the change per row; xcoords
// is each edge evaluated at row y, biased by half a pixel
5045 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5046 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5047 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5048 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5049 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// swap the two edges when the first one lies to the right of the second
5050 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5052 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5053 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// one iteration per row, stepping both edge positions by their slopes
5055 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5057 int startx, endx, offset;
// startx comes from the first edge, endx from the second (upper half of xcoords)
5058 startx = _mm_cvtss_si32(xcoords);
5059 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5062 if (startx < 0) startx = 0;
// move startx toward minx in whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH
// (the mask assumes that constant is a power of two - TODO confirm)
5063 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5065 if (endx > maxx) endx = maxx;
5066 if (startx >= endx) continue;
// cut the row into pieces of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the piece's offset
5067 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5069 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5070 span->triangle = thread->numtriangles;
5073 span->startx = max(minx - offset, 0);
5074 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5075 if (span->startx >= span->endx)
// span queue full: process it now
5077 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5078 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: process pending spans and restart triangle numbering
5083 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5085 DPSOFTRAST_Draw_ProcessSpans(thread);
5086 thread->numtriangles = 0;
// done reading the command's data: drop this thread's reference; separately
// allocated vertex data is freed when the count reaches zero
5090 if (!ATOMIC_DECREMENT(command->refcount))
5092 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5093 MM_FREE(command->arrays);
// process whatever is still queued before returning
5096 if (thread->numspans > 0 || thread->numtriangles > 0)
5098 DPSOFTRAST_Draw_ProcessSpans(thread);
5099 thread->numtriangles = 0;
// Allocates a Draw command together with storage for its vertex data, points
// dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] into that storage and
// copies the element list behind the vertex blocks.
// Storage layout: screen coordinates, post-transform positions, one block per
// array listed for the current shader mode, then the element indices.
5104 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5108 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float4 blocks are always present: screen coordinates and positions
5109 int datasize = 2*numvertices*sizeof(float[4]);
5110 DPSOFTRAST_Command_Draw *command;
5111 unsigned char *data;
// add one float4 block per array used by the shader mode; an entry that is
// not below DPSOFTRAST_ARRAY_TOTAL ends the list
5112 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5114 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5115 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5117 datasize += numvertices*sizeof(float[4]);
// room for the element list in whichever index width is in use
5120 datasize += numtriangles*sizeof(unsigned short[3]);
5122 datasize += numtriangles*sizeof(int[3]);
5123 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to embed: the command holds only the struct and the data lives in
// its own zeroed allocation; otherwise the data directly follows the struct
5124 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5126 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5127 data = (unsigned char *)MM_CALLOC(datasize, 1);
5131 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5132 data = (unsigned char *)command + commandsize;
5134 command->firstvertex = firstvertex;
5135 command->numvertices = numvertices;
5136 command->numtriangles = numtriangles;
5137 command->arrays = (float *)data;
// reset the global output pointers, then hand out consecutive blocks of data
5138 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5139 dpsoftrast.firstvertex = firstvertex;
5140 dpsoftrast.numvertices = numvertices;
5141 dpsoftrast.screencoord4f = (float *)data;
5142 data += numvertices*sizeof(float[4]);
5143 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5144 data += numvertices*sizeof(float[4]);
5145 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5147 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5148 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5150 dpsoftrast.post_array4f[j] = (float *)data;
5151 data += numvertices*sizeof(float[4]);
// the command keeps its own copy of the element list at the end of the data
5153 command->element3i = NULL;
5154 command->element3s = NULL;
5157 command->element3s = (unsigned short *)data;
5158 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5162 command->element3i = (int *)data;
5163 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Queues a draw of numtriangles triangles: allocates the Draw command, runs the
// current shader mode's Vertex callback, records the clamped row range and
// then either wakes the worker threads or flushes immediately.
5168 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5170 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5171 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// clamp the row range taken from dpsoftrast.drawstarty/drawendy to the framebuffer height
5172 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5173 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty row range: release separately allocated vertex data and take the
// command back out of the queue
5174 if (command->starty >= command->endy)
5176 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5177 MM_FREE(command->arrays);
5178 DPSOFTRAST_UndoCommand(command->commandsize);
// every thread holds one reference, dropped in DPSOFTRAST_Interpret_Draw
5181 command->clipped = dpsoftrast.drawclipped;
5182 command->refcount = dpsoftrast.numthreads;
5184 if (dpsoftrast.usethreads)
// publish the command, then signal starving threads whose row bands overlap
// the command's row range
5187 DPSOFTRAST_Draw_SyncCommands();
5188 for (i = 0; i < dpsoftrast.numthreads; i++)
5190 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5191 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5192 Thread_CondSignal(thread->drawcond);
5197 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command payload: the new framebuffer width and height.
5201 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: flags the thread's framebuffer-dependent state for
// revalidation.
5202 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5204 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Switches the global render targets (dimensions, depth buffer and up to four
// color buffers). State is touched only when at least one value differs from
// what is currently set.
5206 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5208 DPSOFTRAST_Command_SetRenderTargets *command;
5209 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5210 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5211 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5213 dpsoftrast.fb_width = width;
5214 dpsoftrast.fb_height = height;
5215 dpsoftrast.fb_depthpixels = depthpixels;
5216 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5217 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5218 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5219 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// viewport center/scale are recomputed after the framebuffer fields change
5220 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue a command carrying the new size for the threads
5221 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5222 command->width = width;
5223 command->height = height;
// Runs the queued commands for one thread, starting at thread->commandoffset
// and stopping when endoffset is reached, then stores the final offset back
// into the thread.
5226 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5228 int commandoffset = thread->commandoffset;
5229 while (commandoffset != endoffset)
5231 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5232 switch (command->opcode)
// INTERPCOMMAND(name): calls DPSOFTRAST_Interpret_<name>, advances by the
// aligned size of that command's struct and wraps to 0 once the offset
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
5234 #define INTERPCOMMAND(name) \
5235 case DPSOFTRAST_OPCODE_##name : \
5236 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5237 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5238 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5239 commandoffset = 0; \
5241 INTERPCOMMAND(Viewport)
5242 INTERPCOMMAND(ClearColor)
5243 INTERPCOMMAND(ClearDepth)
5244 INTERPCOMMAND(ColorMask)
5245 INTERPCOMMAND(DepthTest)
5246 INTERPCOMMAND(ScissorTest)
5247 INTERPCOMMAND(Scissor)
5248 INTERPCOMMAND(BlendFunc)
5249 INTERPCOMMAND(BlendSubtract)
5250 INTERPCOMMAND(DepthMask)
5251 INTERPCOMMAND(DepthFunc)
5252 INTERPCOMMAND(DepthRange)
5253 INTERPCOMMAND(PolygonOffset)
5254 INTERPCOMMAND(CullFace)
5255 INTERPCOMMAND(AlphaTest)
5256 INTERPCOMMAND(AlphaFunc)
5257 INTERPCOMMAND(SetTexture)
5258 INTERPCOMMAND(SetShader)
5259 INTERPCOMMAND(Uniform4f)
5260 INTERPCOMMAND(UniformMatrix4f)
5261 INTERPCOMMAND(Uniform1i)
5262 INTERPCOMMAND(SetRenderTargets)
// Draw commands vary in size, so the advance uses the size stored in the
// command itself; the thread's offset is updated right after each draw
5264 case DPSOFTRAST_OPCODE_Draw:
5265 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5266 commandoffset += command->commandsize;
5267 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5269 thread->commandoffset = commandoffset;
5272 case DPSOFTRAST_OPCODE_Reset:
// publish the position reached once the loop ends
5277 thread->commandoffset = commandoffset;
// Worker thread entry point; `data` is the thread's DPSOFTRAST_State_Thread.
// Loops while thread->index is non-negative, interpreting commands whenever
// its offset differs from dpsoftrast.drawcommand and otherwise sleeping on
// drawcond.
5280 static int DPSOFTRAST_Draw_Thread(void *data)
5282 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5283 while(thread->index >= 0)
5285 if (thread->commandoffset != dpsoftrast.drawcommand)
5287 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// re-check under the mutex before sleeping
5291 Thread_LockMutex(thread->drawmutex);
5292 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: notify a waiter (see DPSOFTRAST_Draw_FlushThreads), then mark the
// thread as starving for the duration of the wait
5294 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5295 thread->starving = true;
5296 Thread_CondWait(thread->drawcond, thread->drawmutex);
5297 thread->starving = false;
5299 Thread_UnlockMutex(thread->drawmutex);
// Brings every thread up to dpsoftrast.drawcommand and then marks the command
// pool as empty by resetting usedcommands.
5305 static void DPSOFTRAST_Draw_FlushThreads(void)
5307 DPSOFTRAST_State_Thread *thread;
5309 DPSOFTRAST_Draw_SyncCommands();
5310 if (dpsoftrast.usethreads)
// pass 1: signal every thread that is behind and currently starving
5312 for (i = 0; i < dpsoftrast.numthreads; i++)
5314 thread = &dpsoftrast.threads[i];
5315 if (thread->commandoffset != dpsoftrast.drawcommand)
5317 Thread_LockMutex(thread->drawmutex);
5318 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5319 Thread_CondSignal(thread->drawcond);
5320 Thread_UnlockMutex(thread->drawmutex);
// pass 2: for each thread still behind, wait on its waitcond; the flag
// `waiting` tells the worker to signal when it has caught up
5323 for (i = 0; i < dpsoftrast.numthreads; i++)
5325 thread = &dpsoftrast.threads[i];
5326 if (thread->commandoffset != dpsoftrast.drawcommand)
5328 Thread_LockMutex(thread->drawmutex);
5329 if (thread->commandoffset != dpsoftrast.drawcommand)
5331 thread->waiting = true;
5332 Thread_CondWait(thread->waitcond, thread->drawmutex);
5333 thread->waiting = false;
5335 Thread_UnlockMutex(thread->drawmutex);
// interprets pending commands directly on the calling thread
// NOTE(review): presumably the branch taken when usethreads is off - confirm
5341 for (i = 0; i < dpsoftrast.numthreads; i++)
5343 thread = &dpsoftrast.threads[i];
5344 if (thread->commandoffset != dpsoftrast.drawcommand)
5345 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5348 dpsoftrast.commandpool.usedcommands = 0;
// Public entry point that forwards to DPSOFTRAST_Draw_FlushThreads.
5351 void DPSOFTRAST_Flush(void)
5353 DPSOFTRAST_Draw_FlushThreads();
// NOTE(review): presumably completes all outstanding work in the same way as
// DPSOFTRAST_Flush - confirm against the function body.
5356 void DPSOFTRAST_Finish(void)
// Resets the global rasterizer state and sets it up for a framebuffer of
// width x height pixels with the given color and depth buffers, then creates
// the per-thread states (and the worker threads when threading is in use).
//   numthreads - requested worker count; threading is enabled only when this
//                is positive and Thread_HasThreads() reports support
//   interlace  - clamped to 0..1, forced to 0 without threads
5361 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5371 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5372 dpsoftrast.bigendian = u.b[3];
5373 dpsoftrast.fb_width = width;
5374 dpsoftrast.fb_height = height;
5375 dpsoftrast.fb_depthpixels = depthpixels;
5376 dpsoftrast.fb_colorpixels[0] = colorpixels;
// only the first color target is supplied here; clear each of the remaining
// three slots (previously slot 1 was assigned three times)
5377 dpsoftrast.fb_colorpixels[1] = NULL;
5378 dpsoftrast.fb_colorpixels[2] = NULL;
5379 dpsoftrast.fb_colorpixels[3] = NULL;
// the initial viewport covers the whole framebuffer
5380 dpsoftrast.viewport[0] = 0;
5381 dpsoftrast.viewport[1] = 0;
5382 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5383 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5384 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture bookkeeping starts at index 1 with texture_max at 0
5385 dpsoftrast.texture_firstfree = 1;
5386 dpsoftrast.texture_end = 1;
5387 dpsoftrast.texture_max = 0;
5388 dpsoftrast.color[0] = 1;
5389 dpsoftrast.color[1] = 1;
5390 dpsoftrast.color[2] = 1;
5391 dpsoftrast.color[3] = 1;
// without threading a single thread state is still created (numthreads = 1);
// with threading the count is clamped to 1..64
5392 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5393 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5394 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5395 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5396 for (i = 0; i < dpsoftrast.numthreads; i++)
5398 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
// default render state for this thread
5400 thread->cullface = GL_BACK;
5401 thread->colormask[1] = 1;
5402 thread->colormask[2] = 1;
5403 thread->colormask[3] = 1;
5404 thread->blendfunc[0] = GL_ONE;
5405 thread->blendfunc[1] = GL_ZERO;
5406 thread->depthmask = true;
5407 thread->depthtest = true;
5408 thread->depthfunc = GL_LEQUAL;
5409 thread->scissortest = false;
5410 thread->alphatest = false;
5411 thread->alphafunc = GL_GREATER;
5412 thread->alphavalue = 0.5f;
// viewport and scissor both start out covering the whole framebuffer
5413 thread->viewport[0] = 0;
5414 thread->viewport[1] = 0;
5415 thread->viewport[2] = dpsoftrast.fb_width;
5416 thread->viewport[3] = dpsoftrast.fb_height;
5417 thread->scissor[0] = 0;
5418 thread->scissor[1] = 0;
5419 thread->scissor[2] = dpsoftrast.fb_width;
5420 thread->scissor[3] = dpsoftrast.fb_height;
5421 thread->depthrange[0] = 0;
5422 thread->depthrange[1] = 1;
5423 thread->polygonoffset[0] = 0;
5424 thread->polygonoffset[1] = 0;
5426 DPSOFTRAST_RecalcThread(thread);
// empty work queues and idle synchronization flags
5428 thread->numspans = 0;
5429 thread->numtriangles = 0;
5430 thread->commandoffset = 0;
5431 thread->waiting = false;
5432 thread->starving = false;
// mark everything dirty and validate once up front
5434 thread->validate = -1;
5435 DPSOFTRAST_Validate(thread, -1);
// synchronization objects and the worker itself exist only in threaded mode
5437 if (dpsoftrast.usethreads)
5439 thread->waitcond = Thread_CreateCond();
5440 thread->drawcond = Thread_CreateCond();
5441 thread->drawmutex = Thread_CreateMutex();
5442 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5448 void DPSOFTRAST_Shutdown(void)
5451 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5453 DPSOFTRAST_State_Thread *thread;
5454 for (i = 0; i < dpsoftrast.numthreads; i++)
5456 thread = &dpsoftrast.threads[i];
5457 Thread_LockMutex(thread->drawmutex);
5459 Thread_CondSignal(thread->drawcond);
5460 Thread_UnlockMutex(thread->drawmutex);
5461 Thread_WaitThread(thread->thread, 0);
5462 Thread_DestroyCond(thread->waitcond);
5463 Thread_DestroyCond(thread->drawcond);
5464 Thread_DestroyMutex(thread->drawmutex);
5467 for (i = 0;i < dpsoftrast.texture_end;i++)
5468 if (dpsoftrast.texture[i].bytes)
5469 MM_FREE(dpsoftrast.texture[i].bytes);
5470 if (dpsoftrast.texture)
5471 free(dpsoftrast.texture);
5472 if (dpsoftrast.threads)
5473 MM_FREE(dpsoftrast.threads);
5474 memset(&dpsoftrast, 0, sizeof(dpsoftrast));