3 #define _USE_MATH_DEFINES
7 #include "dpsoftrast.h"
10 #pragma warning(disable : 4324)
14 typedef qboolean bool;
// Platform abstraction for alignment and atomic counters.
// ALIGN(var) requests 16-byte alignment, ATOMIC(var) requests 32-byte alignment (same value as ATOMIC_SIZE).
// ATOMIC_INCREMENT/DECREMENT/ADD map to OSAtomic*32Barrier on Apple, the __sync_* builtins on GCC
// and the Interlocked* functions on MSVC. MEMORY_BARRIER is _mm_sfence() in all three branches,
// which is a store fence only.
5 #define ATOMIC_SIZE 32
6 #if defined(__APPLE__)
7 #include <libkern/OSAtomic.h>
8 #define ALIGN(var) var __attribute__((__aligned__(16)))
9 #define ATOMIC(var) var __attribute__((__aligned__(32)))
10 #define MEMORY_BARRIER (_mm_sfence())
11 #define ATOMIC_COUNTER volatile int32_t
12 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
13 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
14 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
15 #elif defined(__GNUC__)
16 #define ALIGN(var) var __attribute__((__aligned__(16)))
17 #define ATOMIC(var) var __attribute__((__aligned__(32)))
18 #define MEMORY_BARRIER (_mm_sfence())
19 //(__sync_synchronize())
20 #define ATOMIC_COUNTER volatile int
21 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
22 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
23 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
24 #elif defined(_MSC_VER)
25 #define ALIGN(var) __declspec(align(16)) var
26 #define ATOMIC(var) __declspec(align(32)) var
27 #define MEMORY_BARRIER (_mm_sfence())
28 #define ATOMIC_COUNTER volatile LONG
29 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
30 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
31 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
// Fallback definitions: plain declarations, a no-op barrier and ordinary (non-atomic)
// arithmetic for whatever the platform branches above left undefined.
32 #define ALIGN(var) var
33 #define ATOMIC(var) var
34 #ifndef MEMORY_BARRIER
35 #define MEMORY_BARRIER ((void)0)
36 #ifndef ATOMIC_COUNTER
37 #define ATOMIC_COUNTER int
38 #ifndef ATOMIC_INCREMENT
39 #define ATOMIC_INCREMENT(counter) (++(counter))
40 #ifndef ATOMIC_DECREMENT
41 #define ATOMIC_DECREMENT(counter) (--(counter))
42 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
// Aligned allocation helpers. With <emmintrin.h> available, memory comes from _mm_malloc
// aligned to ATOMIC_SIZE bytes and is released through _mm_free; MM_CALLOC additionally
// zero-fills the block when the allocation succeeded.
// NOTE(review): nmemb*size is not checked for overflow before allocating.
74 #include <emmintrin.h>
76 #define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)
78 static void *MM_CALLOC(size_t nmemb, size_t size)
80 void *ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
81 if (ptr != NULL) memset(ptr, 0, nmemb*size);
85 #define MM_FREE _mm_free
// alternative definitions that forward to the C library allocator
87 #define MM_MALLOC(size) malloc(size)
88 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
// Indices of the per-vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used to size attribute tables.
92 typedef enum DPSOFTRAST_ARRAY_e
94 DPSOFTRAST_ARRAY_POSITION,
95 DPSOFTRAST_ARRAY_COLOR,
96 DPSOFTRAST_ARRAY_TEXCOORD0,
97 DPSOFTRAST_ARRAY_TEXCOORD1,
98 DPSOFTRAST_ARRAY_TEXCOORD2,
99 DPSOFTRAST_ARRAY_TEXCOORD3,
100 DPSOFTRAST_ARRAY_TEXCOORD4,
101 DPSOFTRAST_ARRAY_TEXCOORD5,
102 DPSOFTRAST_ARRAY_TEXCOORD6,
103 DPSOFTRAST_ARRAY_TEXCOORD7,
104 DPSOFTRAST_ARRAY_TOTAL
// One slot of the dpsoftrast.texture[] array. A slot counts as in use while 'bytes' is non-NULL
// (see DPSOFTRAST_Texture_GetByIndex and the free-slot search in DPSOFTRAST_Texture_New).
108 typedef struct DPSOFTRAST_Texture_s
// filter mode, assigned by DPSOFTRAST_Texture_Filter
115 DPSOFTRAST_TEXTURE_FILTER filter;
// atomic counter; presumably the number of outstanding binds of this texture - TODO confirm
118 ATOMIC_COUNTER binds;
// pixel storage holding all mipmap levels back to back
119 unsigned char *bytes;
// per mipmap level: [0] byte offset into 'bytes', [1] size in bytes, [2] width, [3] height, [4] depth
120 int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
// Command queue entry layout.
// NOTE(review): COMMAND_SIZE expands to ALIGN_SIZE, which has no visible definition here
// (only ATOMIC_SIZE is defined nearby) - verify where ALIGN_SIZE comes from.
124 #define COMMAND_SIZE ALIGN_SIZE
125 #define COMMAND_ALIGN(var) ALIGN(var)
// Header shared by every queued command: opcode plus the command's size in bytes
// (both are written by DPSOFTRAST_AllocateCommand).
127 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
129 unsigned char opcode;
130 unsigned short commandsize;
// Opcode 0 is the Reset marker that DPSOFTRAST_AllocateCommand writes when a command
// does not fit into the remaining tail of the pool.
134 enum { DPSOFTRAST_OPCODE_Reset = 0 };
// DEFCOMMAND(opcodeval, name, fields) declares the constant DPSOFTRAST_OPCODE_<name> and the
// aligned struct type DPSOFTRAST_Command_<name>, which starts with the same opcode/commandsize
// header as DPSOFTRAST_Command (the 'fields' argument presumably supplies the payload members).
136 #define DEFCOMMAND(opcodeval, name, fields) \
137 enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
138 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
140 unsigned char opcode; \
141 unsigned short commandsize; \
143 } DPSOFTRAST_Command_##name );
// Capacity of the command pool in bytes (2 MiB).
145 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// presumably the upper bound for a single command's size - TODO confirm against its users
146 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
// Storage for queued commands; DPSOFTRAST_AllocateCommand addresses it by byte offset and
// treats it as a ring (offsets wrap at DPSOFTRAST_DRAW_MAXCOMMANDPOOL).
148 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
152 ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
154 DPSOFTRAST_State_Command_Pool);
// Per-triangle setup data used while shading spans.
156 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
158 unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
// attribute plane equations, 4 floats each: [array][0] = change per unit of span x,
// [array][1] = change per unit of span y, [array][2] = base value (see DPSOFTRAST_CALCATTRIB)
160 ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
162 DPSOFTRAST_State_Triangle);
// DPSOFTRAST_CALCATTRIB evaluates one interpolated attribute at a span's (x, y) with SSE:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// It uses aligned loads (_mm_load_ps), so the attribs rows must be 16-byte aligned.
// DPSOFTRAST_CALCATTRIB4F, defined right after it, computes the same values component by
// component into float[4] style 'slope' and 'data' arguments.
// NOTE(review): CALCATTRIB4F writes (span->x)/(span->y), so its 'span' argument is not
// parenthesized the way it is in CALCATTRIB.
164 #define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
165 slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
166 data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
167 _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
168 _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
170 #define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
171 slope[0] = (triangle)->attribs[arrayindex][0][0]; \
172 slope[1] = (triangle)->attribs[arrayindex][0][1]; \
173 slope[2] = (triangle)->attribs[arrayindex][0][2]; \
174 slope[3] = (triangle)->attribs[arrayindex][0][3]; \
175 data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
176 data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
177 data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
178 data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
// NOTE(review): the use of DPSOFTRAST_DRAW_MAXSUBSPAN is not visible next to this definition.
181 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
// A span generated from a triangle; 'triangle' is an index (an int), and the struct is
// 16-byte aligned through ALIGN. See the per-field comments below.
183 typedef ALIGN(struct DPSOFTRAST_State_Span_s
185 int triangle; // triangle this span was generated by
186 int x; // framebuffer x coord
187 int y; // framebuffer y coord
188 int startx; // usable range (according to pixelmask)
189 int endx; // usable range (according to pixelmask)
190 unsigned char *pixelmask; // true for pixels that passed depth test, false for others
192 DPSOFTRAST_State_Span);
// capacities of the per-thread spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
194 #define DPSOFTRAST_DRAW_MAXSPANS 1024
195 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
// Dirty bits kept in thread->validate; each bit selects one DPSOFTRAST_Recalc* step
// in DPSOFTRAST_Validate. DPSOFTRAST_VALIDATE_DRAW is the union of all three.
197 #define DPSOFTRAST_VALIDATE_FB 1
198 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
199 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
200 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc selects one of these
// from the GL source/destination factor pair; unsupported pairs fall back to OPAQUE.
202 typedef enum DPSOFTRAST_BLENDMODE_e
204 DPSOFTRAST_BLENDMODE_OPAQUE,
205 DPSOFTRAST_BLENDMODE_ALPHA,
206 DPSOFTRAST_BLENDMODE_ADDALPHA,
207 DPSOFTRAST_BLENDMODE_ADD,
208 DPSOFTRAST_BLENDMODE_INVMOD,
209 DPSOFTRAST_BLENDMODE_MUL,
210 DPSOFTRAST_BLENDMODE_MUL2,
211 DPSOFTRAST_BLENDMODE_SUBALPHA,
212 DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
213 DPSOFTRAST_BLENDMODE_INVADD,
214 DPSOFTRAST_BLENDMODE_TOTAL
216 DPSOFTRAST_BLENDMODE;
// Per-thread render state. The DPSOFTRAST_Interpret_* functions write the raw state here and
// set bits in 'validate'; DPSOFTRAST_Validate then refreshes the derived fb_* values.
218 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
// [0] = alongnormal, [1] = intoview (see DPSOFTRAST_Interpret_PolygonOffset)
237 float polygonoffset[2];
240 int shader_permutation;
241 int shader_exactspecularmath;
// pointers into dpsoftrast.texture[]; DPSOFTRAST_Texture_Grow re-bases them after a realloc
243 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
245 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
246 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
248 // DPSOFTRAST_VALIDATE_ flags
251 // derived values (DPSOFTRAST_VALIDATE_FB)
// filled in by DPSOFTRAST_RecalcViewport
254 ALIGN(float fb_viewportcenter[4]);
255 ALIGN(float fb_viewportscale[4]);
257 // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
260 // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// byte offset into the command pool; DPSOFTRAST_Draw_FreeCommandPool compares it with the
// producer's freecommand to find how much of the pool this thread still has to process
269 ATOMIC(volatile int commandoffset);
// set by DPSOFTRAST_Draw_FreeCommandPool while it blocks on this thread's waitcond
271 volatile bool waiting;
// when set, DPSOFTRAST_Draw_FreeCommandPool signals drawcond; presumably set by the
// worker when it has run out of commands - TODO confirm
272 volatile bool starving;
279 DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
280 DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
282 DPSOFTRAST_State_Thread);
// Global rasterizer state; 'dpsoftrast' below is the single instance used throughout this file.
284 typedef ATOMIC(struct DPSOFTRAST_State_s
// depth buffer and up to four color buffers, addressed as y * fb_width + x
288 unsigned int *fb_depthpixels;
289 unsigned int *fb_colorpixels[4];
// viewport transform, refreshed by DPSOFTRAST_Viewport through DPSOFTRAST_RecalcViewport
292 ALIGN(float fb_viewportcenter[4]);
293 ALIGN(float fb_viewportscale[4]);
296 ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
297 int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
// vertex array pointers supplied by the caller
299 const float *pointer_vertex3f;
300 const float *pointer_color4f;
301 const unsigned char *pointer_color4ub;
302 const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
305 int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
306 int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into texture[]; re-based by DPSOFTRAST_Texture_Grow
307 DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
311 float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
312 float *screencoord4f;
318 int shader_permutation;
319 int shader_exactspecularmath;
// texture table: 'texture_firstfree' is where the search for an empty slot starts
323 int texture_firstfree;
324 DPSOFTRAST_Texture *texture;
// message describing the most recent API error (set by the texture functions)
329 const char *errorstring;
// array of numthreads per-thread states
334 DPSOFTRAST_State_Thread *threads;
// copy of commandpool.freecommand published by DPSOFTRAST_Draw_SyncCommands
336 ATOMIC(volatile int drawcommand);
338 DPSOFTRAST_State_Command_Pool commandpool;
342 DPSOFTRAST_State dpsoftrast;
// Depth/colour conversion helpers.
// DPSOFTRAST_DEPTHSCALE is 2^30 as a float; DPSOFTRAST_DEPTH32_FROM_DEPTH32F maps depth d to
// DEPTHSCALE * (1 - d), so larger stored values correspond to smaller d.
// DPSOFTRAST_BGRA8_FROM_RGBA32F rounds each channel (x * 255 + 0.5) and packs it as
// A<<24 | R<<16 | G<<8 | B; inputs are not clamped.
// NOTE(review): the macro arguments r, g, b, a and d are used without parentheses.
344 #define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
345 #define DPSOFTRAST_DEPTHOFFSET (128.0f)
346 #define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)(r * 255.0f + 0.5f) << 16) | ((int)(g * 255.0f + 0.5f) << 8) | (int)(b * 255.0f + 0.5f) | ((int)(a * 255.0f + 0.5f) << 24))
347 #define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-d)))
348 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
// Derives the viewport transform from viewport = {x, y, width, height}.
// Output layout: element 1 is the x term, element 2 the y term (flipped against
// dpsoftrast.fb_height), element 3 is 0.5 for both center and scale, and element 0 is the
// constant 0 (center) / 1 (scale).
350 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
// the -0.5f shifts the center by half a pixel
352 fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
353 fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
354 fb_viewportcenter[3] = 0.5f;
355 fb_viewportcenter[0] = 0.0f;
356 fb_viewportscale[1] = 0.5f * viewport[2];
// negative scale: y grows downwards in the framebuffer
357 fb_viewportscale[2] = -0.5f * viewport[3];
358 fb_viewportscale[3] = 0.5f;
359 fb_viewportscale[0] = 1.0f;
// Assigns the framebuffer rows this thread owns as two bands [miny1, maxy1) and [miny2, maxy2),
// based on thread->index, dpsoftrast.numthreads and dpsoftrast.fb_height.
362 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
364 if (dpsoftrast.interlace)
// interlaced: the framebuffer is cut into 2*numthreads slices; the thread gets slice
// 'index' and slice 'numthreads + index'
366 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
367 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
368 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
369 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
// otherwise: one slice out of numthreads, stored identically in both bands
373 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
374 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
// Recomputes the values derived under DPSOFTRAST_VALIDATE_FB: thread->fb_scissor as
// {x, y, width, height} in framebuffer rows, the viewport transform, and the thread's row bands.
378 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
380 // calculate framebuffer scissor, viewport, viewport clipped by scissor,
381 // and viewport projection values
384 x1 = thread->scissor[0];
385 x2 = thread->scissor[0] + thread->scissor[2];
// y is flipped against the framebuffer height
386 y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
387 y2 = dpsoftrast.fb_height - thread->scissor[1];
// with the scissor test off the rectangle is the whole framebuffer
388 if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
// clamp the far edges to the framebuffer size
390 if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
392 if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
393 thread->fb_scissor[0] = x1;
394 thread->fb_scissor[1] = y1;
395 thread->fb_scissor[2] = x2 - x1;
396 thread->fb_scissor[3] = y2 - y1;
398 DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
399 DPSOFTRAST_RecalcThread(thread);
// Derives the effective depth function: the configured depthfunc while depth testing is
// enabled, GL_ALWAYS otherwise.
402 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
404 thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
// Maps the GL blend factor pair (blendfunc[0] = source, blendfunc[1] = destination) to one of
// the DPSOFTRAST_BLENDMODE_* values in thread->fb_blendmode. The pair is packed into a single
// switch key as (sfactor << 16) | dfactor; any pair without a case selects OPAQUE.
407 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
// subtractive blending: only GL_SRC_ALPHA / GL_ONE is recognised
409 if (thread->blendsubtract)
411 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
// BLENDFUNC emits one case label for a factor pair; it stays defined for the second switch
413 #define BLENDFUNC(sfactor, dfactor, blendmode) \
414 case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
415 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
416 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// regular (non-subtractive) blending
421 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
423 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
424 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
425 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
426 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
427 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
// both factor pairs below select the MUL mode
428 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
429 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
430 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
431 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
432 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
433 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
// DPSOFTRAST_ValidateQuick calls DPSOFTRAST_Validate only when at least one of the requested
// flags is dirty; the expression always evaluates to 0.
// NOTE(review): the 'thread' macro argument is used without parentheses.
438 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
// Refreshes the derived state selected by 'mask' (DPSOFTRAST_VALIDATE_* bits). Only bits that
// are currently dirty in thread->validate are processed; each processed bit is cleared
// before its recalculation runs.
440 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
442 mask &= thread->validate;
445 if (mask & DPSOFTRAST_VALIDATE_FB)
447 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
448 DPSOFTRAST_RecalcFB(thread);
450 if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
452 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
453 DPSOFTRAST_RecalcDepthFunc(thread);
455 if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
457 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
458 DPSOFTRAST_RecalcBlendFunc(thread);
// Looks up a texture by handle. A handle is valid when 1 <= index < texture_end and the slot
// has pixel storage; index 0 is never valid. Callers test the result for NULL, so an invalid
// handle presumably yields NULL.
462 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
464 if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
465 return &dpsoftrast.texture[index];
// Enlarges the texture array and re-bases every bound-texture pointer (the global texbound[]
// and each thread's texbound[]) so it points into the reallocated array.
// NOTE(review): the realloc result is stored straight into dpsoftrast.texture without a NULL
// check, so a failed realloc loses the old array. The re-basing also subtracts 'oldtexture',
// a pointer value that may already have been freed by realloc.
469 static void DPSOFTRAST_Texture_Grow(void)
471 DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
472 DPSOFTRAST_State_Thread *thread;
476 // expand texture array as needed
// capacity starts at 1024 slots and doubles afterwards
477 if (dpsoftrast.texture_max < 1024)
478 dpsoftrast.texture_max = 1024;
480 dpsoftrast.texture_max *= 2;
481 dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
// keep the same slot index, relative to the new base address
482 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
483 if (dpsoftrast.texbound[i])
484 dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
485 for (j = 0; j < dpsoftrast.numthreads; j++)
487 thread = &dpsoftrast.threads[j];
488 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
489 if (thread->texbound[i])
490 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
// Creates a texture and allocates zero-filled storage for all of its mipmap levels.
//   flags  - DPSOFTRAST_TEXTURE_FLAG_* bits combined with a DPSOFTRAST_TEXTURE_FORMAT_* value
//   width, height, depth - dimensions in texels; each must be a power of two and at most
//                          DPSOFTRAST_TEXTURE_MAXSIZE
// On a rejected request dpsoftrast.errorstring describes the reason. The function returns an
// int; presumably the new texture handle on success and 0 on failure - TODO confirm.
494 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
// cubemaps store six faces per level
503 int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
504 int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
505 DPSOFTRAST_Texture *texture;
// NOTE(review): this product is computed in int before the MAXSIZE check below, so very
// large arguments could overflow here.
506 if (width*height*depth < 1)
508 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
511 if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
513 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
// format-specific restrictions
518 case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
519 case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
520 case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
522 case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
523 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
525 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
530 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
533 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
535 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
// restrictions on flag combinations
540 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
542 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
545 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
// NOTE(review): this test repeats the previous one verbatim
550 if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
555 if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
557 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
// x & (x-1) is non-zero exactly when x has more than one bit set
560 if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
562 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
565 // find first empty slot in texture array
566 for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
567 if (!dpsoftrast.texture[texnum].bytes)
569 dpsoftrast.texture_firstfree = texnum + 1;
// grow the array when the chosen slot lies beyond the current capacity
570 if (dpsoftrast.texture_max <= texnum)
571 DPSOFTRAST_Texture_Grow();
572 if (dpsoftrast.texture_end <= texnum)
573 dpsoftrast.texture_end = texnum + 1;
574 texture = &dpsoftrast.texture[texnum];
575 memset(texture, 0, sizeof(*texture));
576 texture->flags = flags;
577 texture->width = width;
578 texture->height = height;
579 texture->depth = depth;
580 texture->sides = sides;
// mipmap table: every level takes 4 bytes per texel for all sides; 'size' is the running
// byte offset, which becomes the total allocation size once the last level is recorded
592 s = w * h * d * sides * 4;
593 texture->mipmap[mipmaps][0] = size;
594 texture->mipmap[mipmaps][1] = s;
595 texture->mipmap[mipmaps][2] = w;
596 texture->mipmap[mipmaps][3] = h;
597 texture->mipmap[mipmaps][4] = d;
// last level: 1x1x1 reached, or the texture has no mipmaps at all
600 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
606 texture->mipmaps = mipmaps;
607 texture->size = size;
609 // allocate the pixels now
610 texture->bytes = (unsigned char *)MM_CALLOC(1, size);
// Releases a texture's pixel storage and clears its slot. Invalid handles are ignored.
614 void DPSOFTRAST_Texture_Free(int index)
616 DPSOFTRAST_Texture *texture;
617 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
621 MM_FREE(texture->bytes);
622 texture->bytes = NULL;
// clearing the whole slot also leaves bytes == NULL, which marks the slot as free
623 memset(texture, 0, sizeof(*texture));
624 // adjust the free range and used range
625 if (dpsoftrast.texture_firstfree > index)
626 dpsoftrast.texture_firstfree = index;
// trim trailing free slots so texture_end stays one past the last used slot
627 while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
628 dpsoftrast.texture_end--;
// Regenerates mipmap levels 1..mipmaps-1 from level 0 by averaging texels of the parent level
// (4 bytes per texel, each byte channel averaged independently with rounding).
// Does nothing for an invalid handle or a texture with a single level.
630 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
632 int i, x, y, z, w, layer0, layer1, row0, row1;
// o = output row in level i; i0..i3 = the parent rows (layer0/layer1 x row0/row1)
633 unsigned char *o, *i0, *i1, *i2, *i3;
634 DPSOFTRAST_Texture *texture;
635 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636 if (texture->mipmaps <= 1)
638 for (i = 1;i < texture->mipmaps;i++)
640 for (z = 0;z < texture->mipmap[i][4];z++)
// clamp the second source layer to the parent's depth
644 if (layer1 >= texture->mipmap[i-1][4])
645 layer1 = texture->mipmap[i-1][4]-1;
646 for (y = 0;y < texture->mipmap[i][3];y++)
// clamp the second source row to the parent's height
650 if (row1 >= texture->mipmap[i-1][3])
651 row1 = texture->mipmap[i-1][3]-1;
// row start = level offset + 4 * ((height * layer + row) * width)
652 o = texture->bytes + texture->mipmap[i ][0] + 4*((texture->mipmap[i ][3] * z + y ) * texture->mipmap[i ][2]);
653 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
654 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
655 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
656 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
657 w = texture->mipmap[i][2];
660 if (texture->mipmap[i-1][2] > 1)
662 // average 3D texture
// eight parent texels per output texel: (sum + 4) >> 3
663 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
665 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
666 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
667 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
668 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
673 // average 3D mipmap with parent width == 1
// four parent texels per output texel: (sum + 2) >> 2
674 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
676 o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
677 o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
678 o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
679 o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
685 if (texture->mipmap[i-1][2] > 1)
687 // average 2D texture (common case)
// 2x2 box filter: (sum + 2) >> 2
688 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
690 o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
691 o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
692 o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
693 o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
698 // 2D texture with parent width == 1
// two parent texels per output texel: (sum + 1) >> 1
699 o[0] = (i0[0] + i1[0] + 1) >> 1;
700 o[1] = (i0[1] + i1[1] + 1) >> 1;
701 o[2] = (i0[2] + i1[2] + 1) >> 1;
702 o[3] = (i0[3] + i1[3] + 1) >> 1;
// Copies a rectangle of 4-byte texels into the texture and then regenerates the mipmaps.
//   pixels - source rows, tightly packed (blockwidth * 4 bytes per row)
//   blockx, blocky, blockwidth, blockheight - destination rectangle in texels
// NOTE(review): the destination address and row pitch below use level 0 (mipmap[0][2]);
// the 'mip' parameter does not appear in these statements - confirm how it is meant to be used.
709 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
711 DPSOFTRAST_Texture *texture;
713 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
718 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
719 while (blockheight > 0)
721 memcpy(dst, pixels, blockwidth * 4);
722 pixels += blockwidth * 4;
// advance by one full texture row
723 dst += texture->mipmap[0][2] * 4;
727 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Replaces the whole of mipmap level 0 with 'pixels' (mipmap[0][1] bytes are copied) and
// regenerates the remaining levels. Invalid handles are ignored.
729 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
731 DPSOFTRAST_Texture *texture;
732 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
737 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Returns the width in texels of mipmap level 'mip', or 0 for an invalid handle.
// NOTE(review): 'mip' is used as an array index without a range check in these statements.
739 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
741 DPSOFTRAST_Texture *texture;
742 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
743 return texture->mipmap[mip][2];
// Returns the height in texels of mipmap level 'mip', or 0 for an invalid handle.
// NOTE(review): 'mip' is used as an array index without a range check in these statements.
745 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
747 DPSOFTRAST_Texture *texture;
748 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
749 return texture->mipmap[mip][3];
// Returns the depth in texels of mipmap level 'mip', or 0 for an invalid handle.
// NOTE(review): 'mip' is used as an array index without a range check in these statements.
751 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
753 DPSOFTRAST_Texture *texture;
754 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
755 return texture->mipmap[mip][4];
// Returns a pointer to the first byte of mipmap level 'mip' inside the texture's storage,
// or a null pointer for an invalid handle. The pointer refers to internal storage.
// NOTE(review): 'mip' is used as an array index without a range check in these statements.
757 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
759 DPSOFTRAST_Texture *texture;
760 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
763 return texture->bytes + texture->mipmap[mip][0];
// Sets the texture's filter mode. Filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are
// reported through dpsoftrast.errorstring when the texture was created without
// DPSOFTRAST_TEXTURE_FLAG_MIPMAP. Invalid handles are ignored.
765 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
767 DPSOFTRAST_Texture *texture;
768 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769 if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
771 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
776 texture->filter = filter;
// forward declaration; DPSOFTRAST_AllocateCommand calls it before the definition appears
779 static void DPSOFTRAST_Draw_FlushThreads(void);
// Publishes the producer's write position: issues MEMORY_BARRIER when threads are in use,
// then copies commandpool.freecommand into dpsoftrast.drawcommand.
781 static void DPSOFTRAST_Draw_SyncCommands(void)
783 if(dpsoftrast.usethreads) MEMORY_BARRIER;
784 dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
// Makes room for 'space' bytes in the command pool by recomputing how much of the pool the
// threads still have to process and, if that is still too much, waiting on a thread.
787 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
789 DPSOFTRAST_State_Thread *thread;
791 int freecommand = dpsoftrast.commandpool.freecommand;
792 int usedcommands = dpsoftrast.commandpool.usedcommands;
// nothing to do when the request already fits
793 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
// publish everything queued so far before waiting on the threads
795 DPSOFTRAST_Draw_SyncCommands();
// find the thread that lags furthest behind the write position
801 for (i = 0; i < dpsoftrast.numthreads; i++)
803 thread = &dpsoftrast.threads[i];
// distance from the thread's read offset to the write offset, with ring wrap-around
804 commandoffset = freecommand - thread->commandoffset;
805 if (commandoffset < 0)
806 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
807 if (commandoffset > usedcommands)
810 usedcommands = commandoffset;
// enough room now, or there is no thread to wait for
813 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
815 thread = &dpsoftrast.threads[waitindex];
816 Thread_LockMutex(thread->drawmutex);
// block only if the thread has not yet caught up with the published position
817 if (thread->commandoffset != dpsoftrast.drawcommand)
819 thread->waiting = true;
820 if (thread->starving) Thread_CondSignal(thread->drawcond);
821 Thread_CondWait(thread->waitcond, thread->drawmutex);
822 thread->waiting = false;
824 Thread_UnlockMutex(thread->drawmutex);
826 dpsoftrast.commandpool.usedcommands = usedcommands;
// DPSOFTRAST_ALIGNCOMMAND rounds 'size' up to the next multiple of COMMAND_SIZE; the mask
// arithmetic is only correct when COMMAND_SIZE is a power of two.
829 #define DPSOFTRAST_ALIGNCOMMAND(size) \
830 ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name) reserves an aligned DPSOFTRAST_Command_<name> in the
// command pool, tagged with DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it.
831 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
832 ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
// Reserves 'size' bytes for a command in the ring-shaped command pool and fills in its header.
//   opcode - DPSOFTRAST_OPCODE_* value stored in the header
//   size   - command size in bytes (already aligned by DPSOFTRAST_ALIGNCOMMAND)
// Returns a pointer to the reserved command (declared as void *; callers cast it).
834 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
836 DPSOFTRAST_Command *command;
837 int freecommand = dpsoftrast.commandpool.freecommand;
838 int usedcommands = dpsoftrast.commandpool.usedcommands;
// always keep room for one extra command header ...
839 int extra = sizeof(DPSOFTRAST_Command);
// ... and, when the command does not fit in the tail of the pool, the tail is skipped too
840 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
841 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// pool too full: wait for the threads to free space, or flush when running unthreaded
842 if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
844 if (dpsoftrast.usethreads)
845 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
847 DPSOFTRAST_Draw_FlushThreads();
// reload the counters, which the calls above may have changed
848 freecommand = dpsoftrast.commandpool.freecommand;
849 usedcommands = dpsoftrast.commandpool.usedcommands;
// mark the unusable tail with a Reset command and account for the skipped bytes
851 if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
853 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
854 command->opcode = DPSOFTRAST_OPCODE_Reset;
855 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
// write the header of the new command
858 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
859 command->opcode = opcode;
860 command->commandsize = size;
// wrap the write offset at the end of the pool
862 if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
864 dpsoftrast.commandpool.freecommand = freecommand;
865 dpsoftrast.commandpool.usedcommands = usedcommands + size;
// Takes back the most recently allocated 'size' bytes of the command pool: usedcommands is
// reduced by 'size' and the write offset is moved back (adding the pool size is presumably
// the wrap-around correction when the offset becomes negative - TODO confirm).
869 static void DPSOFTRAST_UndoCommand(int size)
871 int freecommand = dpsoftrast.commandpool.freecommand;
872 int usedcommands = dpsoftrast.commandpool.usedcommands;
875 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
876 usedcommands -= size;
877 dpsoftrast.commandpool.freecommand = freecommand;
878 dpsoftrast.commandpool.usedcommands = usedcommands;
// Command 1: Viewport. The interpreter stores the rectangle as viewport[] = {x, y, width, height}
// in the thread state and marks the framebuffer-derived values dirty.
881 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
882 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
884 thread->viewport[0] = command->x;
885 thread->viewport[1] = command->y;
886 thread->viewport[2] = command->width;
887 thread->viewport[3] = command->height;
888 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Viewport command and mirrors the rectangle in the global state, recomputing the
// global viewport transform immediately.
890 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
892 DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
895 command->width = width;
896 command->height = height;
898 dpsoftrast.viewport[0] = x;
899 dpsoftrast.viewport[1] = y;
900 dpsoftrast.viewport[2] = width;
901 dpsoftrast.viewport[3] = height;
902 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// Command 2: ClearColor. The interpreter works on the scissor rectangle restricted to the
// rows owned by this thread, for each of the up to four color buffers that are present.
905 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
906 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
908 int i, x1, y1, x2, y2, w, h, x, y;
909 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row bands are up to date
913 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
914 miny1 = thread->miny1;
915 maxy1 = thread->maxy1;
916 miny2 = thread->miny2;
917 maxy2 = thread->maxy2;
918 x1 = thread->fb_scissor[0];
919 y1 = thread->fb_scissor[1];
920 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
921 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the start of band 1 and the end of band 2
922 if (y1 < miny1) y1 = miny1;
923 if (y2 > maxy2) y2 = maxy2;
928 // FIXME: honor fb_colormask?
// clear value packed as BGRA8
929 c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
930 for (i = 0;i < 4;i++)
932 if (!dpsoftrast.fb_colorpixels[i])
// two passes: rows up to the end of band 1 first, then continue from the start of band 2
934 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
937 p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
938 for (x = x1;x < x2;x++)
// Queues a ClearColor command for the given RGBA value.
943 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
945 DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
// Command 3: ClearDepth. The interpreter works on the scissor rectangle restricted to the
// rows owned by this thread, in the depth buffer.
952 DEFCOMMAND(3, ClearDepth, float depth;)
953 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
955 int x1, y1, x2, y2, w, h, x, y;
956 int miny1, maxy1, miny2, maxy2;
// make sure fb_scissor and the thread's row bands are up to date
960 DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
961 miny1 = thread->miny1;
962 maxy1 = thread->maxy1;
963 miny2 = thread->miny2;
964 maxy2 = thread->maxy2;
965 x1 = thread->fb_scissor[0];
966 y1 = thread->fb_scissor[1];
967 x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
968 y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
// restrict the rectangle to the rows between the start of band 1 and the end of band 2
969 if (y1 < miny1) y1 = miny1;
970 if (y2 > maxy2) y2 = maxy2;
// clear value converted to the integer depth representation
975 c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
// two passes: rows up to the end of band 1 first, then continue from the start of band 2
976 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
979 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
980 for (x = x1;x < x2;x++)
// Queues a ClearDepth command for depth value 'd'.
984 void DPSOFTRAST_ClearDepth(float d)
986 DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
// Command 4: ColorMask. Each channel flag is normalised to 0/1, then combined into a 32-bit
// pixel mask: R -> 0x00FF0000, G -> 0x0000FF00, B -> 0x000000FF, A -> 0xFF000000.
990 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
991 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
993 thread->colormask[0] = command->r != 0;
994 thread->colormask[1] = command->g != 0;
995 thread->colormask[2] = command->b != 0;
996 thread->colormask[3] = command->a != 0;
// negating a 0/1 flag gives 0 or all-ones, which is then masked to the channel's byte
997 thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
// Queues a ColorMask command; non-zero arguments enable writes to the respective channel.
999 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1001 DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
// Command 5: DepthTest. Stores the enable flag and marks the derived depth function dirty.
1008 DEFCOMMAND(5, DepthTest, int enable;)
1009 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1011 thread->depthtest = command->enable;
1012 thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
// Queues a DepthTest command carrying the enable flag.
1014 void DPSOFTRAST_DepthTest(int enable)
1016 DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1017 command->enable = enable;
// Command 6: ScissorTest. Stores the enable flag and marks the framebuffer-derived values dirty.
1020 DEFCOMMAND(6, ScissorTest, int enable;)
1021 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1023 thread->scissortest = command->enable;
1024 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a ScissorTest command carrying the enable flag.
1026 void DPSOFTRAST_ScissorTest(int enable)
1028 DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1029 command->enable = enable;
// Command 7: Scissor. Stores the rectangle as scissor[] = {x, y, width, height} and marks the
// framebuffer-derived values dirty. The command carries the values as floats.
1032 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1033 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1035 thread->scissor[0] = command->x;
1036 thread->scissor[1] = command->y;
1037 thread->scissor[2] = command->width;
1038 thread->scissor[3] = command->height;
1039 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Queues a Scissor command for the given rectangle.
1041 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1043 DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1046 command->width = width;
1047 command->height = height;
// Command 8: BlendFunc. Stores blendfunc[] = {sfactor, dfactor} and marks the derived blend
// mode dirty (it is recomputed by DPSOFTRAST_RecalcBlendFunc).
1050 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1051 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1053 thread->blendfunc[0] = command->sfactor;
1054 thread->blendfunc[1] = command->dfactor;
1055 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendFunc command with the GL source and destination factors.
1057 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1059 DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1060 command->sfactor = sfactor;
1061 command->dfactor = dfactor;
// Command 9: BlendSubtract. Stores the enable flag and marks the derived blend mode dirty.
1064 DEFCOMMAND(9, BlendSubtract, int enable;)
1065 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1067 thread->blendsubtract = command->enable;
1068 thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
// Queues a BlendSubtract command carrying the enable flag.
1070 void DPSOFTRAST_BlendSubtract(int enable)
1072 DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1073 command->enable = enable;
// Command 10: DepthMask. Stores the flag in the thread state; no derived state is invalidated.
1076 DEFCOMMAND(10, DepthMask, int enable;)
1077 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1079 thread->depthmask = command->enable;
// Queues a DepthMask command carrying the enable flag.
1081 void DPSOFTRAST_DepthMask(int enable)
1083 DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1084 command->enable = enable;
// Command 11: DepthFunc. Stores the comparison function in the thread state.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest this does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC, although fb_depthfunc is derived from depthfunc in
// DPSOFTRAST_RecalcDepthFunc - confirm whether fb_depthfunc can go stale here.
1087 DEFCOMMAND(11, DepthFunc, int func;)
1088 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1090 thread->depthfunc = command->func;
// Public entry point: allocates a DepthFunc command and records the function
// code in it.
1092 void DPSOFTRAST_DepthFunc(int func)
1094 DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1095 command->func = func;
// Command 12 (DepthRange): payload is the near and far depth values.
// DEFCOMMAND is defined outside this excerpt.
1098 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Applies a queued DepthRange command: stores near in thread->depthrange[0]
// and far in thread->depthrange[1].
1099 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1101 thread->depthrange[0] = command->nearval;
1102 thread->depthrange[1] = command->farval;
// Public entry point: allocates a DepthRange command and records the
// near/far values in it.
1104 void DPSOFTRAST_DepthRange(float nearval, float farval)
1106 DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1107 command->nearval = nearval;
1108 command->farval = farval;
// Command 13 (PolygonOffset): payload is the two offset terms.
// DEFCOMMAND is defined outside this excerpt.
1111 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Applies a queued PolygonOffset command: stores alongnormal in
// thread->polygonoffset[0] and intoview in thread->polygonoffset[1].
1112 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1114 thread->polygonoffset[0] = command->alongnormal;
1115 thread->polygonoffset[1] = command->intoview;
// Public entry point: allocates a PolygonOffset command and records both
// offset terms in it.
1117 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1119 DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1120 command->alongnormal = alongnormal;
1121 command->intoview = intoview;
// Command 14 (CullFace): payload is the cull mode.
// DEFCOMMAND is defined outside this excerpt.
1124 DEFCOMMAND(14, CullFace, int mode;)
// Applies a queued CullFace command: stores the mode in thread->cullface.
1125 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1127 thread->cullface = command->mode;
// Public entry point: allocates a CullFace command and records the mode in it.
1129 void DPSOFTRAST_CullFace(int mode)
1131 DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1132 command->mode = mode;
// Command 15 (AlphaTest): payload is a single enable flag.
// DEFCOMMAND is defined outside this excerpt.
1135 DEFCOMMAND(15, AlphaTest, int enable;)
// Applies a queued AlphaTest command: stores the flag in thread->alphatest.
1136 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1138 thread->alphatest = command->enable;
// Public entry point: allocates an AlphaTest command and records the enable
// flag in it.
1140 void DPSOFTRAST_AlphaTest(int enable)
1142 DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1143 command->enable = enable;
// Command 16 (AlphaFunc): payload is the comparison function code and the
// reference value. DEFCOMMAND is defined outside this excerpt.
1146 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Applies a queued AlphaFunc command: stores the function code in
// thread->alphafunc and the reference value in thread->alphavalue.
1147 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1149 thread->alphafunc = command->func;
1150 thread->alphavalue = command->ref;
// Public entry point: allocates an AlphaFunc command and records the function
// code in it.
// NOTE(review): the store of `ref` into the command is not visible in this
// excerpt; presumably it is on a line that is not shown - confirm.
1152 void DPSOFTRAST_AlphaFunc(int func, float ref)
1154 DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1155 command->func = func;
// Sets the constant RGBA colour in dpsoftrast.color[0..3]. Unlike the
// neighbouring state setters, the visible code writes the global state
// directly and allocates no command.
1159 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1161 dpsoftrast.color[0] = r;
1162 dpsoftrast.color[1] = g;
1163 dpsoftrast.color[2] = b;
1164 dpsoftrast.color[3] = a;
// Reads a rectangular block of pixels from colour buffer 0 into outpixels.
//   blockx, blocky           - origin of the requested block
//   blockwidth, blockheight  - size of the requested block
//   outpixels                - destination, addressed with a row stride of
//                              blockwidth * 4 bytes
// The block is clamped to [0, fb_width) x [0, fb_height) before copying.
// NOTE(review): several declarations and the per-pixel copy statements are not
// visible in this excerpt, so the exact byte order written is not documented
// here.
1167 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
// Both buffers hold 4 bytes per pixel.
1169 int outstride = blockwidth * 4;
1170 int instride = dpsoftrast.fb_width * 4;
1173 int bx2 = blockx + blockwidth;
1174 int by2 = blocky + blockheight;
1178 unsigned char *inpixels;
// Clamp the requested rectangle to the framebuffer.
1182 if (bx1 < 0) bx1 = 0;
1183 if (by1 < 0) by1 = 0;
1184 if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1185 if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1187 inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
// Big-endian hosts take a per-pixel inner loop over x.
1188 if (dpsoftrast.bigendian)
1190 for (y = by1;y < by2;y++)
// Row y of the request is read from framebuffer row (fb_height - 1 - y): the
// image is flipped vertically relative to framebuffer storage order.
1192 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1193 o = (unsigned char *)outpixels + (y - by1) * outstride;
1194 for (x = bx1;x < bx2;x++)
// Other path: same row addressing as above.
1207 for (y = by1;y < by2;y++)
1209 b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1210 o = (unsigned char *)outpixels + (y - by1) * outstride;
// Copies a rectangle of colour buffer 0 into one mip level of a texture.
//   index          - texture handle, resolved with DPSOFTRAST_Texture_GetByIndex
//   mip            - mip level; must satisfy 0 <= mip < texture->mipmaps
//   tx, ty         - destination origin inside the texture level
//   sx, sy         - source origin inside the framebuffer
//   width, height  - size of the rectangle
// Returns silently if the texture handle or the mip level is invalid.
// NOTE(review): the declarations of tx1/ty1/sx1/sy1 and the computation of
// tw/th/sw/sh are not visible in this excerpt.
1216 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1220 int tx2 = tx + width;
1221 int ty2 = ty + height;
1224 int sx2 = sx + width;
1225 int sy2 = sy + height;
1235 unsigned int *spixels;
1236 unsigned int *tpixels;
1237 DPSOFTRAST_Texture *texture;
1238 texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1239 if (mip < 0 || mip >= texture->mipmaps) return;
// Source is the framebuffer; destination is the selected mip level inside the
// texture's byte storage (mipmap[mip][0] is used as a byte offset, [2] and [3]
// as the level's width and height).
1241 spixels = dpsoftrast.fb_colorpixels[0];
1242 swidth = dpsoftrast.fb_width;
1243 sheight = dpsoftrast.fb_height;
1244 tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1245 twidth = texture->mipmap[mip][2];
1246 theight = texture->mipmap[mip][3];
// Clamp the destination rectangle to the texture level and the source
// rectangle to the framebuffer.
1247 if (tx1 < 0) tx1 = 0;
1248 if (ty1 < 0) ty1 = 0;
1249 if (tx2 > twidth) tx2 = twidth;
1250 if (ty2 > theight) ty2 = theight;
1251 if (sx1 < 0) sx1 = 0;
1252 if (sy1 < 0) sy1 = 0;
1253 if (sx2 > swidth) sx2 = swidth;
1254 if (sy2 > sheight) sy2 = sheight;
// The copied size is limited to the smaller of the two rectangles.
1259 if (tw > sw) tw = sw;
1260 if (th > sh) th = sh;
1261 if (tw < 1 || th < 1)
// Source rows are walked downwards from (sheight - 1 - sy1) while destination
// rows go upwards from ty1, so the copy flips the image vertically.
1263 sy1 = sheight - 1 - sy1;
1264 for (y = 0;y < th;y++)
1265 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
// Rebuild the texture's mipmaps when it has more than one level.
1266 if (texture->mipmaps > 1)
1267 DPSOFTRAST_Texture_CalculateMipmaps(index);
// Command 17 (SetTexture): payload is the texture unit number and the texture
// to bind to it. DEFCOMMAND is defined outside this excerpt.
1270 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Applies a queued SetTexture command to one thread: if the unit already has a
// texture bound, atomically decrements that texture's binds counter, then
// stores the new texture pointer (which may be NULL, as the guard implies) in
// thread->texbound[unitnum].
1271 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1273 if (thread->texbound[command->unitnum])
1274 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1275 thread->texbound[command->unitnum] = command->texture;
1277 void DPSOFTRAST_SetTexture(int unitnum, int index)
1279 DPSOFTRAST_Command_SetTexture *command;
1280 DPSOFTRAST_Texture *texture;
1281 if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1283 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1286 texture = DPSOFTRAST_Texture_GetByIndex(index);
1287 if (index && !texture)
1289 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1293 command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1294 command->unitnum = unitnum;
1295 command->texture = texture;
1297 dpsoftrast.texbound[unitnum] = texture;
1298 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
// Records the vertex position array and its stride in the global state
// (dpsoftrast.pointer_vertex3f / dpsoftrast.stride_vertex). Only the pointer
// is stored; no vertex data is copied here.
1301 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1303 dpsoftrast.pointer_vertex3f = vertex3f;
1304 dpsoftrast.stride_vertex = stride;
// Selects a float colour array: stores the pointer and stride and clears the
// unsigned-byte colour pointer, so only one of the two colour sources is set.
1306 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1308 dpsoftrast.pointer_color4f = color4f;
1309 dpsoftrast.pointer_color4ub = NULL;
1310 dpsoftrast.stride_color = stride;
// Selects an unsigned-byte colour array: stores the pointer and stride and
// clears the float colour pointer, so only one of the two colour sources is
// set.
1312 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1314 dpsoftrast.pointer_color4f = NULL;
1315 dpsoftrast.pointer_color4ub = color4ub;
1316 dpsoftrast.stride_color = stride;
// Records the texture coordinate array for one unit: pointer, number of float
// components per vertex, and stride.
// NOTE(review): unitnum indexes the three arrays directly; no range check is
// visible in this excerpt, so callers presumably guarantee a valid unit.
1318 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1320 dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1321 dpsoftrast.components_texcoord[unitnum] = numcomponents;
1322 dpsoftrast.stride_texcoord[unitnum] = stride;
// Command 18 (SetShader): payload is the shader mode, permutation and the
// exactspecularmath flag. DEFCOMMAND is defined outside this excerpt.
1325 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Applies a queued SetShader command: copies the three values into the
// thread's shader_mode / shader_permutation / shader_exactspecularmath.
1326 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1328 thread->shader_mode = command->mode;
1329 thread->shader_permutation = command->permutation;
1330 thread->shader_exactspecularmath = command->exactspecularmath;
// Public entry point: queues a SetShader command for the worker threads and
// also mirrors the same three values into the global dpsoftrast state.
1332 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1334 DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1335 command->mode = mode;
1336 command->permutation = permutation;
1337 command->exactspecularmath = exactspecularmath;
// Mirror into the global state in addition to the queued command.
1339 dpsoftrast.shader_mode = mode;
1340 dpsoftrast.shader_permutation = permutation;
1341 dpsoftrast.shader_exactspecularmath = exactspecularmath;
// Command 19 (Uniform4f): payload is the uniform index and four floats.
// DEFCOMMAND is defined outside this excerpt.
1344 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Applies a queued Uniform4f command: copies the four floats into the thread's
// uniform4f array at float offset index*4 (one 4-float slot per uniform).
1345 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1347 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Sets a 4-component float uniform from individual values: queues a Uniform4f
// command for the worker threads and mirrors the values into the global
// dpsoftrast.uniform4f array at float offset index*4.
1349 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1351 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1352 command->index = index;
1353 command->val[0] = v0;
1354 command->val[1] = v1;
1355 command->val[2] = v2;
1356 command->val[3] = v3;
// Mirror into the global state in addition to the queued command.
1358 dpsoftrast.uniform4f[index*4+0] = v0;
1359 dpsoftrast.uniform4f[index*4+1] = v1;
1360 dpsoftrast.uniform4f[index*4+2] = v2;
1361 dpsoftrast.uniform4f[index*4+3] = v3;
// Array form of DPSOFTRAST_Uniform4f: v must point to four floats, which are
// copied both into a queued Uniform4f command and into the global
// dpsoftrast.uniform4f array at float offset index*4.
1363 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1365 DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1366 command->index = index;
1367 memcpy(command->val, v, sizeof(command->val));
1369 memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
// Command 20 (UniformMatrix4f): payload is the uniform index and 16 floats,
// declared through ALIGN(). DEFCOMMAND is defined outside this excerpt.
1372 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Applies a queued UniformMatrix4f command: copies all 16 floats into the
// thread's uniform4f array starting at float offset index*4, so the matrix
// fills four consecutive 4-float uniform slots.
1373 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1375 memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
// Uploads an array of 4x4 float matrices as uniforms.
//   uniform   - index of the first uniform slot to write
//   arraysize - number of matrices to upload
//   transpose - transpose request (see note below)
//   v         - arraysize * 16 floats; may be unaligned
// One UniformMatrix4f command is queued per matrix, and each matrix is also
// mirrored into the global dpsoftrast.uniform4f array.
// NOTE(review): the statement that tests `transpose` is not visible in this
// excerpt; the 4x4 transpose sequence below is presumably guarded by it.
1377 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
// Each matrix advances the uniform index by 4 slots and the input by 16 floats.
1381 for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1383 __m128 m0, m1, m2, m3;
1384 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1385 command->index = (DPSOFTRAST_UNIFORM)index;
// Pick unaligned or aligned loads depending on the address of v.
1386 if (((size_t)v)&(ALIGN_SIZE-1))
1388 m0 = _mm_loadu_ps(v);
1389 m1 = _mm_loadu_ps(v+4);
1390 m2 = _mm_loadu_ps(v+8);
1391 m3 = _mm_loadu_ps(v+12);
1395 m0 = _mm_load_ps(v);
1396 m1 = _mm_load_ps(v+4);
1397 m2 = _mm_load_ps(v+8);
1398 m3 = _mm_load_ps(v+12);
// 4x4 transpose of m0..m3 using unpack and movelh/movehl.
1402 __m128 t0, t1, t2, t3;
1403 t0 = _mm_unpacklo_ps(m0, m1);
1404 t1 = _mm_unpacklo_ps(m2, m3);
1405 t2 = _mm_unpackhi_ps(m0, m1);
1406 t3 = _mm_unpackhi_ps(m2, m3);
1407 m0 = _mm_movelh_ps(t0, t1);
1408 m1 = _mm_movehl_ps(t1, t0);
1409 m2 = _mm_movelh_ps(t2, t3);
1410 m3 = _mm_movehl_ps(t3, t2);
// Aligned stores: both command->val and the uniform4f slot must be 16-byte
// aligned for _mm_store_ps.
1412 _mm_store_ps(command->val, m0);
1413 _mm_store_ps(command->val+4, m1);
1414 _mm_store_ps(command->val+8, m2);
1415 _mm_store_ps(command->val+12, m3);
1416 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1417 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1418 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1419 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
// Command 21 (Uniform1i): payload is the uniform index and one integer value.
// DEFCOMMAND is defined outside this excerpt.
1424 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Applies a queued Uniform1i command: stores the value in
// thread->uniform1i[index].
1425 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1427 thread->uniform1i[command->index] = command->val;
// Sets an integer uniform: queues a Uniform1i command and mirrors the value
// into the global dpsoftrast.uniform1i array.
// NOTE(review): the store of i0 into the command is not visible in this
// excerpt; presumably it is on a line that is not shown - confirm.
1429 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1431 DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1432 command->index = index;
1435 dpsoftrast.uniform1i[command->index] = i0;
// Copies `size` 4-float elements from a strided byte source into a packed
// destination of 4 floats per element.
//   dst    - destination; written with aligned stores, so it must be 16-byte
//            aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between consecutive source elements
// NOTE(review): the loop statements and pointer increments are not visible in
// this excerpt.
1439 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1441 float *end = dst + size*4;
// If either the start address or the stride breaks alignment, every element
// must be read with an unaligned load.
1442 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1446 _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1455 _mm_store_ps(dst, _mm_load_ps((const float *)src));
// Expands `size` 3-float elements from a strided byte source into packed
// 4-float elements, setting the fourth component of every output to 1.0f.
//   dst    - destination; written with aligned stores, so it must be 16-byte
//            aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between consecutive source elements
// NOTE(review): the loop statements and most pointer increments are not
// visible in this excerpt.
// NOTE(review): the single-element paths load 16 bytes from a 12-byte element,
// so they read 4 bytes beyond each element - confirm the source arrays are
// padded accordingly.
1462 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 float *end = dst + size*4;
// Tightly packed input: handle four elements (three 16-byte loads) at a time;
// end4 marks the end of the part of the output that is a multiple of four.
1465 if (stride == sizeof(float[3]))
1467 float *end4 = dst + (size&~3)*4;
1468 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3. Each output is built
// by rotating its three components into lanes 1..3, placing 1.0f in lane 0
// with _mm_move_ss, and rotating back so that 1.0f ends up in lane 3.
1472 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
1473 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1474 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1475 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1476 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1477 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1480 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1481 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1484 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1486 src += 4*sizeof(float[3]);
// Same four-at-a-time conversion using aligned loads.
1493 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1494 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1495 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1496 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1497 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1498 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1501 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1502 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1503 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1504 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1505 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507 src += 4*sizeof(float[3]);
// One element at a time: load 4 floats, overwrite the fourth with 1.0f.
1511 if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1515 __m128 v = _mm_loadu_ps((const float *)src);
1516 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1517 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1518 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1519 _mm_store_ps(dst, v);
1528 __m128 v = _mm_load_ps((const float *)src);
1529 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1530 v = _mm_move_ss(v, _mm_set_ss(1.0f));
1531 v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1532 _mm_store_ps(dst, v);
// Expands `size` 2-float elements from a strided byte source into packed
// 4-float elements of the form (a, b, 0.0f, 1.0f).
//   dst    - destination; written with aligned stores, so it must be 16-byte
//            aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between consecutive source elements
// NOTE(review): the loop statements and most pointer increments are not
// visible in this excerpt.
1539 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1541 float *end = dst + size*4;
// Supplies the constant third and fourth components (0, 1) of every output.
1542 __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
// Tightly packed input: one 16-byte load yields two elements; end2 marks the
// end of the part of the output that is a multiple of two.
1543 if (stride == sizeof(float[2]))
1545 float *end2 = dst + (size&~1)*4;
1546 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1550 __m128 v = _mm_loadu_ps((const float *)src);
1551 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1552 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1554 src += 2*sizeof(float[2]);
1561 __m128 v = _mm_load_ps((const float *)src);
1562 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1563 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1565 src += 2*sizeof(float[2]);
// Single element: load 8 bytes into the low half of v2, keeping (0, 1) above.
1571 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
// Converts `size` 4-byte elements from a strided source into packed 4-float
// elements, scaling each byte by 1/255 so that 0..255 maps to 0.0..1.0.
//   dst    - destination; written with aligned stores, so it must be 16-byte
//            aligned
//   src    - first source element
//   size   - number of elements
//   stride - byte distance between consecutive source elements
// NOTE(review): the loop statements and most pointer increments are not
// visible in this excerpt.
1577 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1579 float *end = dst + size*4;
1580 __m128 scale = _mm_set1_ps(1.0f/255.0f);
// Tightly packed input: one 16-byte load yields four elements; end4 marks the
// end of the part of the output that is a multiple of four.
1581 if (stride == sizeof(unsigned char[4]))
1583 float *end4 = dst + (size&~3)*4;
1584 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
// Zero-extend bytes to 16 bits, then to 32 bits, convert to float and scale.
1588 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1589 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1590 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1591 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1592 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594 src += 4*sizeof(unsigned char[4]);
// Same four-at-a-time conversion using an aligned load.
1601 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1602 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1603 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1604 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1605 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1607 src += 4*sizeof(unsigned char[4]);
// Single element: read the 4 bytes as one int, then widen and scale as above.
1613 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1614 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
// Fills the destination with copies of one 4-float value.
//   dst  - destination; written with aligned stores, so it must be 16-byte
//          aligned
//   src  - the 4 floats to replicate; read once with an unaligned load
//   size - number of 4-float elements (end = dst + 4*size)
// NOTE(review): the loop statement itself is not visible in this excerpt.
1620 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1622 float *end = dst + 4*size;
1623 __m128 v = _mm_loadu_ps(src);
1626 _mm_store_ps(dst, v);
// Transforms `numitems` 4-float vectors by a 4x4 matrix.
//   out4f       - destination array (may be the same pointer as in4f)
//   in4f        - source array; may be unaligned
//   numitems    - number of 4-float vectors
//   inmatrix16f - 16 floats, read as four consecutive 4-float groups m0..m3
// Each result is v[0]*m0 + v[1]*m1 + v[2]*m2 + v[3]*m3.
// NOTE(review): the loop statements and the store into out4f are not visible
// in this excerpt.
1632 void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1635 static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1636 __m128 m0, m1, m2, m3;
// A bytewise compare against the identity matrix selects the copy-only path.
1638 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1640 // fast case for identity matrix
1641 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1644 end = out4f + numitems*4;
1645 m0 = _mm_loadu_ps(inmatrix16f);
1646 m1 = _mm_loadu_ps(inmatrix16f + 4);
1647 m2 = _mm_loadu_ps(inmatrix16f + 8);
1648 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Unaligned input uses _mm_loadu_ps; aligned input uses _mm_load_ps. The
// arithmetic is identical in both paths.
1649 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1653 __m128 v = _mm_loadu_ps(in4f);
1655 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1656 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1657 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1658 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1667 __m128 v = _mm_load_ps(in4f);
1669 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1670 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1671 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1672 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy, so the two
// ranges must not overlap.
1680 void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1682 memcpy(out4f, in4f, numitems * sizeof(float[4]));
// Projects one clip-space vertex to screen space.
//   in  = (x, y, z, w)
//   out = (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w)
// The input is rotated so that lane 0 holds 1.0f and lanes 1..3 hold x, y, z;
// viewportcenter/viewportscale are applied in that rotated layout (their lane
// 0 therefore pairs with the 1/w term), and the result is rotated back.
// `in` is evaluated once; `out` may name the same variable as `in`.
1686 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1688 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Projection macro whose visible body is token-for-token the same as
// DPSOFTRAST_PROJECTVERTEX: rotate so lane 0 holds 1.0f, compute
// viewportcenter + viewportscale * p / w, rotate back into `out`.
// NOTE(review): no use of this macro is visible in this excerpt.
1694 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1696 __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1697 p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1698 p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1699 out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
// Multiplies one 4-float vector by a matrix given as four __m128 groups:
//   out = p[0]*m0 + p[1]*m1 + p[2]*m2 + p[3]*m3
// NOTE(review): the line that defines `p` is not visible in this excerpt;
// presumably it is a local copy of `in`.
1702 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1705 out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1706 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1707 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1708 _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
// Computes the range of screen rows covered by an axis-aligned bounding box
// after transformation by inmatrix16f.
//   starty, endy     - outputs: first row and one-past-last row
//   minposf, maxposf - box corners; read with aligned loads, so they must be
//                      16-byte aligned
//   inmatrix16f      - 16 floats, read as four 4-float groups
// The eight box corners are transformed; bit k of clipmask stays set for
// corner k when its clip distance (lane 2 + lane 3 of the transformed corner)
// is negative. For unclipped corners the projected Y is folded into
// minproj/maxproj. For clipped corners, each box edge leading to an unclipped
// neighbour (k^1, k^2, k^4) is intersected with the clip plane and the
// intersection's projected Y is folded in instead.
// NOTE(review): the return statement is not visible in this excerpt; callers
// store the result in dpsoftrast.drawclipped, so presumably clipmask is
// returned - confirm. The BBFRONT(0)/BBFRONT(7) invocations and the #define of
// BBCLIP are likewise not visible.
1711 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1713 int clipmask = 0xFF;
1714 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1715 __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1716 __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1717 __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
// Swap lanes 0 and 1 of every matrix group so the transformed Y lands in
// lane 0, where the scalar (_ss) operations below work on it.
1718 m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1719 m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1720 m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1721 m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k, compute its clip distance, and if it is
// not clipped clear bit k and fold its projected Y into minproj/maxproj.
1722 #define BBFRONT(k, pos) \
1724 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1725 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1726 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1729 clipmask &= ~(1<<k); \
1730 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1731 minproj = _mm_min_ss(minproj, proj); \
1732 maxproj = _mm_max_ss(maxproj, proj); \
1736 BBFRONT(1, _mm_move_ss(minpos, maxpos));
1737 BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1738 BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
1739 BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1740 BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
1741 BBFRONT(6, _mm_move_ss(maxpos, minpos));
1745 if (clipmask&(1<<k)) \
1747 if (!(clipmask&(1<<(k^1)))) \
1749 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1750 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1751 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1752 minproj = _mm_min_ss(minproj, proj); \
1753 maxproj = _mm_max_ss(maxproj, proj); \
1755 if (!(clipmask&(1<<(k^2)))) \
1757 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1758 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1759 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1760 minproj = _mm_min_ss(minproj, proj); \
1761 maxproj = _mm_max_ss(maxproj, proj); \
1763 if (!(clipmask&(1<<(k^4)))) \
1765 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1766 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1767 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1768 minproj = _mm_min_ss(minproj, proj); \
1769 maxproj = _mm_max_ss(maxproj, proj); \
1773 BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// Bring the Y entry of the viewport constants (lane 2 of the stored layout)
// into lane 0, clamp the projected range to [-2, 2], and map it to rows.
1774 viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1775 viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1776 minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1777 maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1778 minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1779 maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// starty comes from maxproj and endy from minproj (+1, making it exclusive);
// presumably the Y viewport scale is negative so larger projected Y means a
// smaller row - confirm.
1780 *starty = _mm_cvttss_si32(maxproj);
1781 *endy = _mm_cvttss_si32(minproj)+1;
// Projects `numitems` clip-space vertices to screen space without applying a
// matrix.
//   out4f        - receives a copy of each input vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices; may be unaligned
// While looping it tracks the component-wise minimum and maximum of the input
// and finally passes that box, with an identity matrix, to
// DPSOFTRAST_Vertex_BoundY, whose result it returns.
// NOTE(review): the loop statements and pointer increments are not visible in
// this excerpt.
1785 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1787 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1788 float *end = out4f + numitems*4;
1789 __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1790 __m128 minpos, maxpos;
// Unaligned input path.
1791 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
// Seed the bounds with the first vertex.
1793 minpos = maxpos = _mm_loadu_ps(in4f);
1796 __m128 v = _mm_loadu_ps(in4f);
1797 minpos = _mm_min_ps(minpos, v);
1798 maxpos = _mm_max_ps(maxpos, v);
1799 _mm_store_ps(out4f, v);
1800 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801 _mm_store_ps(screen4f, v);
// Aligned input path: same work with aligned loads.
1809 minpos = maxpos = _mm_load_ps(in4f);
1812 __m128 v = _mm_load_ps(in4f);
1813 minpos = _mm_min_ps(minpos, v);
1814 maxpos = _mm_max_ps(maxpos, v);
1815 _mm_store_ps(out4f, v);
1816 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1817 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned arrays, as DPSOFTRAST_Vertex_BoundY loads them
// with aligned loads.
1825 ALIGN(float minposf[4]);
1826 ALIGN(float maxposf[4]);
1827 _mm_store_ps(minposf, minpos);
1828 _mm_store_ps(maxposf, maxpos);
1829 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
// Transforms `numitems` vertices by inmatrix16f and projects them to screen
// space.
//   out4f        - receives each transformed vertex (aligned stores)
//   screen4f     - receives each projected vertex (aligned stores)
//   starty, endy - outputs filled in by DPSOFTRAST_Vertex_BoundY
//   in4f         - input vertices; may be unaligned
//   inmatrix16f  - 16 floats, read as four 4-float groups
// An identity matrix (bytewise compare) delegates to
// DPSOFTRAST_Vertex_Project. Otherwise the bounding box is accumulated from
// the untransformed inputs and handed to DPSOFTRAST_Vertex_BoundY together
// with the matrix; that function's result is returned.
// NOTE(review): the loop statements and pointer increments are not visible in
// this excerpt.
1834 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1836 static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1837 __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1839 if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1840 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1841 end = out4f + numitems*4;
1842 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1843 viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1844 m0 = _mm_loadu_ps(inmatrix16f);
1845 m1 = _mm_loadu_ps(inmatrix16f + 4);
1846 m2 = _mm_loadu_ps(inmatrix16f + 8);
1847 m3 = _mm_loadu_ps(inmatrix16f + 12);
// Unaligned input path.
1848 if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1850 minpos = maxpos = _mm_loadu_ps(in4f);
1853 __m128 v = _mm_loadu_ps(in4f);
// Bounds are taken before the transform, i.e. in the input space.
1854 minpos = _mm_min_ps(minpos, v);
1855 maxpos = _mm_max_ps(maxpos, v);
1856 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1857 _mm_store_ps(out4f, v);
1858 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1859 _mm_store_ps(screen4f, v);
// Aligned input path: same work with aligned loads.
1867 minpos = maxpos = _mm_load_ps(in4f);
1870 __m128 v = _mm_load_ps(in4f);
1871 minpos = _mm_min_ps(minpos, v);
1872 maxpos = _mm_max_ps(maxpos, v);
1873 DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1874 _mm_store_ps(out4f, v);
1875 DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1876 _mm_store_ps(screen4f, v);
// Spill the bounds to aligned arrays, as DPSOFTRAST_Vertex_BoundY loads them
// with aligned loads.
1884 ALIGN(float minposf[4]);
1885 ALIGN(float maxposf[4]);
1886 _mm_store_ps(minposf, minpos);
1887 _mm_store_ps(maxposf, maxpos);
1888 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f);
// Expands one client vertex array into dpsoftrast.post_array4f[outarray] as
// 4 floats per vertex, covering numvertices vertices starting at firstvertex.
//   outarray - index of the destination post_array4f slot
//   inarray  - which client array to read (position, colour, or a texcoord
//              unit addressed as inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
// NOTE(review): the switch statement, its case labels for the texcoord path,
// the fallback branches and the return statement are not visible in this
// excerpt; presumably the function returns outf.
1894 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1897 float *outf = dpsoftrast.post_array4f[outarray];
1898 const unsigned char *inb;
1899 int firstvertex = dpsoftrast.firstvertex;
1900 int numvertices = dpsoftrast.numvertices;
// Positions are stored as 3 floats per vertex and widened to 4.
1904 case DPSOFTRAST_ARRAY_POSITION:
1905 stride = dpsoftrast.stride_vertex;
1906 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1907 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
// Colours come from the float array if set, else from the byte array, else
// from the constant dpsoftrast.color replicated for every vertex.
1909 case DPSOFTRAST_ARRAY_COLOR:
1910 stride = dpsoftrast.stride_color;
1911 if (dpsoftrast.pointer_color4f)
1913 inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1914 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916 else if (dpsoftrast.pointer_color4ub)
1918 stride = dpsoftrast.stride_color;
1919 inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1920 DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1924 DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
// Texture coordinates: the loader is chosen by the unit's component count.
1928 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1929 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1931 inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1932 switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1935 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1938 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1941 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
// Transforms a post-array in place by inmatrix16f. If inarray >= 0 the array
// is first loaded from the client array via DPSOFTRAST_Array_Load; otherwise
// the existing contents of post_array4f[outarray] are used.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1953 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1955 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1956 DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
// Projects a post-array to screen space. If inarray >= 0 the array is first
// loaded via DPSOFTRAST_Array_Load; otherwise post_array4f[outarray] is used
// as is. Screen coordinates go to dpsoftrast.screencoord4f, the row range to
// dpsoftrast.drawstarty/drawendy, and the value returned by
// DPSOFTRAST_Vertex_Project is stored in dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1961 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1964 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1965 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
// Transforms a post-array in place by inmatrix16f and projects it to screen
// space. If inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load;
// otherwise post_array4f[outarray] is used as is. Screen coordinates go to
// dpsoftrast.screencoord4f, the row range to dpsoftrast.drawstarty/drawendy,
// and the value returned by DPSOFTRAST_Vertex_TransformProject is stored in
// dpsoftrast.drawclipped.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the function returns data.
1973 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1976 float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1977 dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
// Produces a per-pixel value z = 1/w across a span, for pixels
// span->startx .. span->endx - 1.
//   triangle->w[0], w[1], w[2] - x slope, y slope and constant term of w
//   span->x, span->y           - span origin used to evaluate w
//   zf                         - output array
// The exact reciprocal is evaluated only at sub-span boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels) and stepped linearly by dz in between.
// NOTE(review): the declaration of z/dz and the loop body that writes zf are
// not visible in this excerpt; presumably z restarts from the previous endz
// and each pixel stores z into zf[x] - confirm.
1984 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1987 int startx = span->startx;
1988 int endx = span->endx;
1989 float wslope = triangle->w[0];
// w evaluated at the span origin; wslope * x is added per pixel column.
1990 float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1991 float endz = 1.0f / (w + wslope * startx);
1992 for (x = startx;x < endx;)
1994 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// The final sub-span is shortened so that it ends on the last pixel.
1996 if (nextsub >= endx) nextsub = endsub = endx-1;
1997 endz = 1.0f / (w + wslope * nextsub);
// Guard against a zero-length sub-span to avoid dividing by zero.
1998 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1999 for (; x <= endsub; x++, z += dz)
2004 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2007 int startx = span->startx;
2008 int endx = span->endx;
2011 unsigned char * RESTRICT pixelmask = span->pixelmask;
2012 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2015 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2016 // handle alphatest now (this affects depth writes too)
2017 if (thread->alphatest)
2018 for (x = startx;x < endx;x++)
2019 if (in4f[x*4+3] < 0.5f)
2020 pixelmask[x] = false;
2021 // FIXME: this does not handle bigendian
2022 switch(thread->fb_blendmode)
2024 case DPSOFTRAST_BLENDMODE_OPAQUE:
2025 for (x = startx;x < endx;x++)
2029 d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2030 d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2031 d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2032 d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2033 pixel[x*4+0] = d[0];
2034 pixel[x*4+1] = d[1];
2035 pixel[x*4+2] = d[2];
2036 pixel[x*4+3] = d[3];
2039 case DPSOFTRAST_BLENDMODE_ALPHA:
2040 for (x = startx;x < endx;x++)
2044 a = in4f[x*4+3] * 255.0f;
2045 b = 1.0f - in4f[x*4+3];
2046 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2047 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2048 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2049 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2050 pixel[x*4+0] = d[0];
2051 pixel[x*4+1] = d[1];
2052 pixel[x*4+2] = d[2];
2053 pixel[x*4+3] = d[3];
2056 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2057 for (x = startx;x < endx;x++)
2061 a = in4f[x*4+3] * 255.0f;
2062 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2063 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2064 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2065 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2066 pixel[x*4+0] = d[0];
2067 pixel[x*4+1] = d[1];
2068 pixel[x*4+2] = d[2];
2069 pixel[x*4+3] = d[3];
2072 case DPSOFTRAST_BLENDMODE_ADD:
2073 for (x = startx;x < endx;x++)
2077 d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2078 d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2079 d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2080 d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2081 pixel[x*4+0] = d[0];
2082 pixel[x*4+1] = d[1];
2083 pixel[x*4+2] = d[2];
2084 pixel[x*4+3] = d[3];
2087 case DPSOFTRAST_BLENDMODE_INVMOD:
2088 for (x = startx;x < endx;x++)
2092 d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2093 d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2094 d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2095 d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2096 pixel[x*4+0] = d[0];
2097 pixel[x*4+1] = d[1];
2098 pixel[x*4+2] = d[2];
2099 pixel[x*4+3] = d[3];
2102 case DPSOFTRAST_BLENDMODE_MUL:
2103 for (x = startx;x < endx;x++)
2107 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2108 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2109 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2110 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2111 pixel[x*4+0] = d[0];
2112 pixel[x*4+1] = d[1];
2113 pixel[x*4+2] = d[2];
2114 pixel[x*4+3] = d[3];
2117 case DPSOFTRAST_BLENDMODE_MUL2:
2118 for (x = startx;x < endx;x++)
2122 d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2123 d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2124 d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2125 d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2126 pixel[x*4+0] = d[0];
2127 pixel[x*4+1] = d[1];
2128 pixel[x*4+2] = d[2];
2129 pixel[x*4+3] = d[3];
2132 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2133 for (x = startx;x < endx;x++)
2137 a = in4f[x*4+3] * -255.0f;
2138 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2139 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2140 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2141 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2142 pixel[x*4+0] = d[0];
2143 pixel[x*4+1] = d[1];
2144 pixel[x*4+2] = d[2];
2145 pixel[x*4+3] = d[3];
2148 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2149 for (x = startx;x < endx;x++)
2154 b = 1.0f - in4f[x*4+3];
2155 d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2156 d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2157 d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2158 d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2159 pixel[x*4+0] = d[0];
2160 pixel[x*4+1] = d[1];
2161 pixel[x*4+2] = d[2];
2162 pixel[x*4+3] = d[3];
2165 case DPSOFTRAST_BLENDMODE_INVADD:
2166 for (x = startx;x < endx;x++)
2170 d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2171 d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2172 d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2173 d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2174 pixel[x*4+0] = d[0];
2175 pixel[x*4+1] = d[1];
2176 pixel[x*4+2] = d[2];
2177 pixel[x*4+3] = d[3];
// Writes one span of shaded 8-bit pixels to color buffer 0 (dpsoftrast.fb_colorpixels[0]),
// combining them with the existing framebuffer contents according to thread->fb_blendmode.
//   thread - supplies the alphatest flag and fb_blendmode
//   span   - supplies x/y (framebuffer origin of the span), startx/endx and pixelmask
//   in4ub  - source colors, 4 bytes per pixel, indexed by span-relative x
// Pixels whose pixelmask byte is 0 are left untouched.
2183 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2187 int startx = span->startx;
2188 int endx = span->endx;
2189 const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2190 unsigned char * RESTRICT pixelmask = span->pixelmask;
2191 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2192 unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
// advance both views of the color buffer to the first pixel of this span
2195 pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2196 pixeli += span->y * dpsoftrast.fb_width + span->x;
2197 // handle alphatest now (this affects depth writes too)
// in4ub holds bytes (0..255), so the half-alpha cutoff is 128; the previous
// comparison against 0.5f could only ever reject alpha == 0
2198 if (thread->alphatest)
2199 for (x = startx;x < endx;x++)
2200 if (in4ub[x*4+3] < 128)
2201 pixelmask[x] = false;
2202 // FIXME: this does not handle bigendian
2203 switch(thread->fb_blendmode)
// OPAQUE: when four consecutive mask bytes are all 1, copy four pixels with one 128-bit store
2205 case DPSOFTRAST_BLENDMODE_OPAQUE:
2206 for (x = startx;x + 4 <= endx;)
2208 if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2210 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
// FINISHBLEND(blend2, blend1): shared blend loop. Pixels are handled in pairs, selected by
// the two mask bytes read as one unsigned short; src/dst are widened to 16 bits per channel,
// blend2 is applied when both pixels are processed together and blend1 for a single pixel,
// and the result is packed back to bytes with unsigned saturation (_mm_packus_epi16).
// ALPHA: dst += (src - dst) * srcalpha / 256
2224 case DPSOFTRAST_BLENDMODE_ALPHA:
2225 #define FINISHBLEND(blend2, blend1) \
2226 for (x = startx;x + 1 < endx;x += 2) \
2229 switch (*(const unsigned short*)&pixelmask[x]) \
2232 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2235 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2238 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2239 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2241 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2245 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2247 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2252 for(;x < endx; x++) \
2255 if (!pixelmask[x]) \
2257 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2258 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2260 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2264 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2265 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2267 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2268 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
// ADDALPHA: dst += (src * srcalpha) >> 8
2271 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2273 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2274 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2276 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2277 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// ADD: dst = src + dst
2280 case DPSOFTRAST_BLENDMODE_ADD:
2281 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
// INVMOD: dst -= (dst * src) >> 8
2283 case DPSOFTRAST_BLENDMODE_INVMOD:
2285 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2287 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
// MUL: dst = (src * dst) >> 8
2290 case DPSOFTRAST_BLENDMODE_MUL:
2291 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
// MUL2: dst = (src * dst) >> 7, i.e. twice the MUL result
2293 case DPSOFTRAST_BLENDMODE_MUL2:
2294 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
// SUBALPHA: dst -= (src * srcalpha) >> 8
2296 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2298 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2299 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2301 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2302 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
// PSEUDOALPHA: dst = src + dst - ((dst * srcalpha) >> 8)
2305 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2307 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2308 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2310 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2311 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
// INVADD: dst += (255 - dst) * src / 256
2314 case DPSOFTRAST_BLENDMODE_INVADD:
2316 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2318 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
// Samples the 2D texture bound to texunitindex across the span and writes the result as
// floats to out4f (4 floats per pixel, indexed by span-relative x). Texel bytes 2,1,0,3 are
// written to output channels 0,1,2,3 respectively, scaled to 0..1.
//   thread       - supplies texbound[]
//   triangle     - supplies mip[] and the attribute data read by DPSOFTRAST_CALCATTRIB4F
//   span         - supplies startx/endx
//   texunitindex - which bound texture to sample
//   arrayindex   - which attribute array provides the texture coordinates
//   zf           - per-pixel factor the interpolated coordinates are multiplied by
// Coordinates are evaluated exactly at sub-span endpoints (every DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels) and stepped linearly in 16.16 fixed point in between.
2325 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2328 int startx = span->startx;
2329 int endx = span->endx;
2334 float tc[2], endtc[2];
2336 unsigned int tci[2];
2337 unsigned int tci1[2];
2338 unsigned int tcimin[2];
2339 unsigned int tcimax[2];
2344 const unsigned char * RESTRICT pixelbase;
2345 const unsigned char * RESTRICT pixel[4];
2346 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2347 // if no texture is bound, just fill it with white
2350 for (x = startx;x < endx;x++)
2352 out4f[x*4+0] = 1.0f;
2353 out4f[x*4+1] = 1.0f;
2354 out4f[x*4+2] = 1.0f;
2355 out4f[x*4+3] = 1.0f;
2359 mip = triangle->mip[texunitindex];
2360 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2361 // if this mipmap of the texture is 1 pixel, just fill it with that color
// read the texel through pixelbase so the mip level's offset (mipmap[mip][0]) is honored;
// indexing texture->bytes directly always fetched the first texel of the texture data
2362 if (texture->mipmap[mip][1] == 4)
2364 c[0] = pixelbase[2] * (1.0f/255.0f);
2365 c[1] = pixelbase[1] * (1.0f/255.0f);
2366 c[2] = pixelbase[0] * (1.0f/255.0f);
2367 c[3] = pixelbase[3] * (1.0f/255.0f);
2368 for (x = startx;x < endx;x++)
2370 out4f[x*4+0] = c[0];
2371 out4f[x*4+1] = c[1];
2372 out4f[x*4+2] = c[2];
2373 out4f[x*4+3] = c[3];
2377 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2378 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2379 flags = texture->flags;
// mipmap[mip][2] and [3] are used as the level's width and height in texels
2380 tcscale[0] = texture->mipmap[mip][2];
2381 tcscale[1] = texture->mipmap[mip][3];
2382 tciwidth = texture->mipmap[mip][2];
2385 tcimax[0] = texture->mipmap[mip][2]-1;
2386 tcimax[1] = texture->mipmap[mip][3]-1;
// NOTE(review): wrapping by AND with size-1 is only exact for power-of-two sizes - confirm
2387 tciwrapmask[0] = texture->mipmap[mip][2]-1;
2388 tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel; the -0.5 shifts from texel corners to centers
2389 endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2390 endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2391 for (x = startx;x < endx;)
2393 unsigned int subtc[2];
2394 unsigned int substep[2];
2395 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2396 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
// the last sub-span is shortened to end on the final pixel of the span
2397 if (nextsub >= endx)
2399 nextsub = endsub = endx-1;
2400 if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2404 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2405 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
// NOTE(review): these float -> unsigned int conversions receive negative values when a
// coordinate or step is negative, which C leaves undefined - confirm the targets wrap as relied on
2406 substep[0] = (endtc[0] - tc[0]) * subscale;
2407 substep[1] = (endtc[1] - tc[1]) * subscale;
2408 subtc[0] = tc[0] * (1<<16);
2409 subtc[1] = tc[1] * (1<<16);
// 4-texel weighted blend, indices clamped to [tcimin, tcimax]
2412 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2414 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// blend weights use the top 12 bits of the 16-bit fraction (the integer texel is subtc>>16);
// masking the raw value with 0xFFF took the lowest 12 bits instead
2416 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2417 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2418 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2419 tci[0] = subtc[0]>>16;
2420 tci[1] = subtc[1]>>16;
2421 tci1[0] = tci[0] + 1;
2422 tci1[1] = tci[1] + 1;
2423 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2424 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2425 tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2426 tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2427 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2428 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2429 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2430 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
// the four weights sum to 0x1000*0x1000, so 255 * that (0xFF000000) normalizes to 0..1
2431 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2432 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2433 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2434 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2435 out4f[x*4+0] = c[0];
2436 out4f[x*4+1] = c[1];
2437 out4f[x*4+2] = c[2];
2438 out4f[x*4+3] = c[3];
// 4-texel weighted blend, indices wrapped with tciwrapmask
2443 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
// same fraction extraction as the clamped loop: top 12 bits of the 16-bit fraction
2445 unsigned int frac[2] = { (subtc[0]>>4)&0xFFF, (subtc[1]>>4)&0xFFF };
2446 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2447 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2448 tci[0] = subtc[0]>>16;
2449 tci[1] = subtc[1]>>16;
2450 tci1[0] = tci[0] + 1;
2451 tci1[1] = tci[1] + 1;
2452 tci[0] &= tciwrapmask[0];
2453 tci[1] &= tciwrapmask[1];
2454 tci1[0] &= tciwrapmask[0];
2455 tci1[1] &= tciwrapmask[1];
2456 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2457 pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2458 pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2459 pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2460 c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2461 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2462 c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2463 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2464 out4f[x*4+0] = c[0];
2465 out4f[x*4+1] = c[1];
2466 out4f[x*4+2] = c[2];
2467 out4f[x*4+3] = c[3];
// single-texel fetch, indices clamped to [tcimin, tcimax]
2471 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2473 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2475 tci[0] = subtc[0]>>16;
2476 tci[1] = subtc[1]>>16;
2477 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2478 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2479 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2480 c[0] = pixel[0][2] * (1.0f / 255.0f);
2481 c[1] = pixel[0][1] * (1.0f / 255.0f);
2482 c[2] = pixel[0][0] * (1.0f / 255.0f);
2483 c[3] = pixel[0][3] * (1.0f / 255.0f);
2484 out4f[x*4+0] = c[0];
2485 out4f[x*4+1] = c[1];
2486 out4f[x*4+2] = c[2];
2487 out4f[x*4+3] = c[3];
// single-texel fetch, indices wrapped with tciwrapmask
2492 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2494 tci[0] = subtc[0]>>16;
2495 tci[1] = subtc[1]>>16;
2496 tci[0] &= tciwrapmask[0];
2497 tci[1] &= tciwrapmask[1];
2498 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2499 c[0] = pixel[0][2] * (1.0f / 255.0f);
2500 c[1] = pixel[0][1] * (1.0f / 255.0f);
2501 c[2] = pixel[0][0] * (1.0f / 255.0f);
2502 c[3] = pixel[0][3] * (1.0f / 255.0f);
2503 out4f[x*4+0] = c[0];
2504 out4f[x*4+1] = c[1];
2505 out4f[x*4+2] = c[2];
2506 out4f[x*4+3] = c[3];
2512 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2516 int startx = span->startx;
2517 int endx = span->endx;
2519 __m128 data, slope, tcscale;
2520 __m128i tcsize, tcmask, tcoffset, tcmax;
2522 __m128i subtc, substep, endsubtc;
2525 unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2526 const unsigned char * RESTRICT pixelbase;
2527 DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2528 // if no texture is bound, just fill it with white
2531 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2534 mip = triangle->mip[texunitindex];
2535 pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2536 // if this mipmap of the texture is 1 pixel, just fill it with that color
2537 if (texture->mipmap[mip][1] == 4)
2539 unsigned int k = *((const unsigned int *)pixelbase);
2540 for (x = startx;x < endx;x++)
2544 filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2545 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2546 flags = texture->flags;
2547 tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2548 tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2549 tcscale = _mm_cvtepi32_ps(tcsize);
2550 data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2551 slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2552 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2553 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2554 tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2555 tcmax = _mm_packs_epi32(tcmask, tcmask);
2556 for (x = startx;x < endx;)
2558 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2559 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2560 if (nextsub >= endx)
2562 nextsub = endsub = endx-1;
2563 if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2567 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2568 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2569 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2570 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2571 substep = _mm_slli_epi32(substep, 1);
2574 __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2575 if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2577 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2578 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2580 const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;
2581 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2582 tci = _mm_madd_epi16(tci, tcoffset);
2583 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2584 ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2585 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2586 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2587 pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2588 pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2589 fracm = _mm_srli_epi16(subtc, 1);
2590 pix1 = _mm_add_epi16(pix1,
2591 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2592 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2593 pix3 = _mm_add_epi16(pix3,
2594 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2595 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2596 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2597 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2598 pix2 = _mm_add_epi16(pix2,
2599 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2600 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2601 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2605 const unsigned char * RESTRICT ptr1;
2606 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2607 tci = _mm_madd_epi16(tci, tcoffset);
2608 ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2609 pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2610 pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2611 fracm = _mm_srli_epi16(subtc, 1);
2612 pix1 = _mm_add_epi16(pix1,
2613 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2615 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2616 pix1 = _mm_add_epi16(pix1,
2617 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2619 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2623 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2625 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2627 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2628 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2629 tci = _mm_madd_epi16(tci, tcoffset);
2630 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2631 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2632 _mm_setzero_si128());
2633 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2634 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2635 _mm_setzero_si128());
2636 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2637 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2638 tci = _mm_madd_epi16(tci, tcoffset);
2639 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2640 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2641 _mm_setzero_si128());
2642 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2643 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2644 _mm_setzero_si128());
2645 fracm = _mm_srli_epi16(subtc, 1);
2646 pix1 = _mm_add_epi16(pix1,
2647 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2648 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2649 pix3 = _mm_add_epi16(pix3,
2650 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2651 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2652 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2653 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2654 pix2 = _mm_add_epi16(pix2,
2655 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2656 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2657 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2661 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2662 tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2663 tci = _mm_madd_epi16(tci, tcoffset);
2664 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2665 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2666 _mm_setzero_si128());
2667 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2668 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2669 _mm_setzero_si128());
2670 fracm = _mm_srli_epi16(subtc, 1);
2671 pix1 = _mm_add_epi16(pix1,
2672 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2674 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2675 pix1 = _mm_add_epi16(pix1,
2676 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2678 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2684 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2686 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2687 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2688 tci = _mm_madd_epi16(tci, tcoffset);
2689 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2690 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2691 _mm_setzero_si128());
2692 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2693 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2694 _mm_setzero_si128());
2695 tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2696 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2697 tci = _mm_madd_epi16(tci, tcoffset);
2698 pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2699 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2700 _mm_setzero_si128());
2701 pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2702 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2703 _mm_setzero_si128());
2704 fracm = _mm_srli_epi16(subtc, 1);
2705 pix1 = _mm_add_epi16(pix1,
2706 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2707 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2708 pix3 = _mm_add_epi16(pix3,
2709 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2710 _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2711 pix2 = _mm_unpacklo_epi64(pix1, pix3);
2712 pix4 = _mm_unpackhi_epi64(pix1, pix3);
2713 pix2 = _mm_add_epi16(pix2,
2714 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2715 _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2716 _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2720 __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2721 tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2722 tci = _mm_madd_epi16(tci, tcoffset);
2723 pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2724 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2725 _mm_setzero_si128());
2726 pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2727 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2728 _mm_setzero_si128());
2729 fracm = _mm_srli_epi16(subtc, 1);
2730 pix1 = _mm_add_epi16(pix1,
2731 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732 _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2733 pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2734 pix1 = _mm_add_epi16(pix1,
2735 _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2736 _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2737 outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2744 if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2746 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2748 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2749 tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2750 tci = _mm_madd_epi16(tci, tcoffset);
2751 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2752 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2756 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2757 tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2758 tci = _mm_madd_epi16(tci, tcoffset);
2759 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2765 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2767 __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2768 tci = _mm_and_si128(tci, tcmax);
2769 tci = _mm_madd_epi16(tci, tcoffset);
2770 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2771 outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2775 __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2776 tci = _mm_and_si128(tci, tcmax);
2777 tci = _mm_madd_epi16(tci, tcoffset);
2778 outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
// Cube-map counterpart of the 2D BGRA8 sampler. The visible code performs no texture
// lookup: it fills the span's pixels in out4ub (4 bytes per pixel, from span->startx up to
// span->endx) with 255, i.e. opaque white. triangle, texunitindex, arrayindex and zf are
// not used by the fill.
2787 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
// the byte count is (endx - startx) * 4; the reversed subtraction produced a negative
// value that memset would interpret as an enormous size_t and write far out of bounds
2790 memset(out4ub + span->startx*4, 255, (span->endx - span->startx)*4);
// Takes a read-only float array and returns a single float.
// NOTE(review): presumably a shadow-map lookup for the given vector, going by the name -
// confirm the expected component count of `vector` and the range of the result.
2793 float DPSOFTRAST_SampleShadowmap(const float *vector)
// Multiplies each pixel of in4f by an interpolated 4-component attribute and stores the
// product in out4f (both 4 floats per pixel, indexed by span-relative x).
//   triangle, span - passed to DPSOFTRAST_CALCATTRIB4F to obtain data[] and slope[] for
//                    attribute arrayindex; span also supplies startx/endx
//   zf             - NOTE(review): presumably the source of the per-pixel factor z - confirm
2799 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2802 int startx = span->startx;
2803 int endx = span->endx;
2808 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2809 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2812 c[0] = (data[0] + slope[0]*x) * z;
2813 c[1] = (data[1] + slope[1]*x) * z;
2814 c[2] = (data[2] + slope[2]*x) * z;
2815 c[3] = (data[3] + slope[3]*x) * z;
2816 out4f[x*4+0] = in4f[x*4+0] * c[0];
2817 out4f[x*4+1] = in4f[x*4+1] * c[1];
2818 out4f[x*4+2] = in4f[x*4+2] * c[2];
2819 out4f[x*4+3] = in4f[x*4+3] * c[3];
// Writes an interpolated 4-component attribute to out4f for every pixel of the span
// (4 floats per pixel, indexed by span-relative x).
//   triangle, span - passed to DPSOFTRAST_CALCATTRIB4F to obtain data[] and slope[] for
//                    attribute arrayindex; span also supplies startx/endx
//   zf             - NOTE(review): presumably the source of the per-pixel factor z - confirm
2823 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2826 int startx = span->startx;
2827 int endx = span->endx;
2832 DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2833 for (x = startx;x < endx;x++)
// attribute value at pixel x: linear in x, then scaled by z
2836 c[0] = (data[0] + slope[0]*x) * z;
2837 c[1] = (data[1] + slope[1]*x) * z;
2838 c[2] = (data[2] + slope[2]*x) * z;
2839 c[3] = (data[3] + slope[3]*x) * z;
2840 out4f[x*4+0] = c[0];
2841 out4f[x*4+1] = c[1];
2842 out4f[x*4+2] = c[2];
2843 out4f[x*4+3] = c[3];
// For every pixel of the span: out = ina + max(inb - subcolor, 0), per channel.
//   span     - supplies startx/endx; triangle is not referenced in the visible code
//   out4f    - destination, 4 floats per pixel indexed by span-relative x
//   ina4f    - base color buffer
//   inb4f    - buffer from which subcolor is subtracted before it is added
//   subcolor - 4 floats subtracted from each inb4f pixel
2847 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2849 int x, startx = span->startx, endx = span->endx;
2850 float c[4], localcolor[4];
// copy subcolor into locals before the loop
2851 localcolor[0] = subcolor[0];
2852 localcolor[1] = subcolor[1];
2853 localcolor[2] = subcolor[2];
2854 localcolor[3] = subcolor[3];
2855 for (x = startx;x < endx;x++)
// negative differences are clamped to zero so the pass can only brighten
2857 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2858 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2859 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2860 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2861 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2862 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2863 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2864 out4f[x*4+3] = ina4f[x*4+3] + c[3];
// Component-wise product of two float span buffers: out = ina * inb for
// x in [span->startx, span->endx), four floats per pixel.
2868 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2870 int x, startx = span->startx, endx = span->endx;
2871 for (x = startx;x < endx;x++)
2873 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2874 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2875 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2876 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
// Component-wise sum of two float span buffers: out = ina + inb for
// x in [span->startx, span->endx), four floats per pixel. No clamping is done.
2880 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2882 int x, startx = span->startx, endx = span->endx;
2883 for (x = startx;x < endx;x++)
2885 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2886 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2887 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2888 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
// Blends two float span buffers per pixel: out = ina * a + inb * b for
// x in [span->startx, span->endx), where a = 1 - (fourth component of inb).
// NOTE(review): b is presumably the fourth component of inb (inb4f[x*4+3]),
// making this a standard alpha blend of inb over ina - confirm.
2892 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2894 int x, startx = span->startx, endx = span->endx;
2896 for (x = startx;x < endx;x++)
// weight given to ina is the complement of inb's fourth component
2898 a = 1.0f - inb4f[x*4+3];
2900 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2901 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2902 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2903 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
// Blends a float span buffer toward one constant color:
// out = in * (1 - color[3]) + color * color[3], per component, for
// x in [span->startx, span->endx). The fourth component is blended the same
// way as the first three.
//   color - four floats; color[3] is the blend weight
2907 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2909 int x, startx = span->startx, endx = span->endx;
2910 float localcolor[4], ilerp, lerp;
2911 localcolor[0] = color[0];
2912 localcolor[1] = color[1];
2913 localcolor[2] = color[2];
2914 localcolor[3] = color[3];
// blend weights are constant for the whole span, so compute them once
2915 ilerp = 1.0f - localcolor[3];
2916 lerp = localcolor[3];
2917 for (x = startx;x < endx;x++)
2919 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2920 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2921 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2922 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2928 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2932 int startx = span->startx;
2933 int endx = span->endx;
2936 __m128i submod, substep, endsubmod;
2937 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2938 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2939 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2940 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2941 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2942 for (x = startx; x < endx;)
2944 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2945 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2946 if (nextsub >= endx)
2948 nextsub = endsub = endx-1;
2949 if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2953 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2954 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2955 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2956 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2957 substep = _mm_packs_epi32(substep, substep);
2958 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2960 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2961 pix = _mm_mulhi_epu16(pix, submod);
2962 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2966 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2967 pix = _mm_mulhi_epu16(pix, submod);
2968 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2975 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2979 int startx = span->startx;
2980 int endx = span->endx;
2983 __m128i submod, substep, endsubmod;
2984 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2985 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2986 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2987 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2988 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2989 for (x = startx; x < endx;)
2991 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2992 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2993 if (nextsub >= endx)
2995 nextsub = endsub = endx-1;
2996 if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3000 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3001 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3002 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3003 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3004 substep = _mm_packs_epi32(substep, substep);
3005 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3007 __m128i pix = _mm_srai_epi16(submod, 4);
3008 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3012 __m128i pix = _mm_srai_epi16(submod, 4);
3013 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// 8-bit bloom combine: out = saturate(ina + max(inb - subcolor, 0)) per byte
// channel, for pixels from span->startx up to span->endx.
//   subcolor - four floats; scaled by 255, with components 0 and 2 swapped to
//              match the pixel byte order
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3020 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3023 int x, startx = span->startx, endx = span->endx;
3024 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// duplicate the four 16-bit channel values so they line up with two pixels
3025 localcolor = _mm_packs_epi32(localcolor, localcolor);
3026 for (x = startx;x+2 <= endx;x+=2)
// widen bytes to 16-bit lanes; the unsigned saturating subtract clamps at zero
3028 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3029 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3030 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
// packus saturates each lane to 0..255 while narrowing back to bytes
3031 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads and stores
3035 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3036 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3037 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3038 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit modulate of two span buffers: out = (ina * inb) / 256 per byte
// channel, for pixels from span->startx up to span->endx.
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3043 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3046 int x, startx = span->startx, endx = span->endx;
3047 for (x = startx;x+2 <= endx;x+=2)
// ina goes into the low byte of each 16-bit lane, inb into the high byte (value*256),
// so the high 16 bits of the product equal ina*inb/256
3049 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3051 pix1 = _mm_mulhi_epu16(pix1, pix2);
3052 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads and stores
3056 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3058 pix1 = _mm_mulhi_epu16(pix1, pix2);
3059 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit saturating add of two span buffers: out = min(ina + inb, 255) per
// byte channel, for pixels from span->startx up to span->endx.
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3064 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3067 int x, startx = span->startx, endx = span->endx;
3068 for (x = startx;x+2 <= endx;x+=2)
// widen to 16-bit lanes so the sum cannot wrap; packus clamps to 255 when narrowing
3070 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3072 pix1 = _mm_add_epi16(pix1, pix2);
3073 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads and stores
3077 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3078 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3079 pix1 = _mm_add_epi16(pix1, pix2);
3080 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit tinted add: out = saturate(ina + inb * tint) per byte channel, for
// pixels from span->startx up to span->endx.
//   inbtintbgra - four floats, used in the order given (no component swap is
//                 applied here); scaled by 256 into fixed point
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3085 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3088 int x, startx = span->startx, endx = span->endx;
3089 __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
// duplicate the four 16-bit tint values so they line up with two pixels
3090 tint = _mm_packs_epi32(tint, tint);
3091 for (x = startx;x+2 <= endx;x+=2)
// inb goes into the high byte of each lane (value*256), so mulhi(tint, inb) is inb*tint in byte units
3093 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3095 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3096 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads and stores
3100 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3102 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3103 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of inb over ina using inb's fourth channel as the weight:
// out = ina + (inb - ina) * alpha / 256 per byte channel, for pixels from
// span->startx up to span->endx.
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3108 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3111 int x, startx = span->startx, endx = span->endx;
3112 for (x = startx;x+2 <= endx;x+=2)
3114 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
// broadcast each pixel's fourth channel of inb to all four lanes of that pixel
3116 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
// both operands are shifted left by 4, so the signed high product is (inb - ina) * alpha / 256
3117 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3118 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
// single-pixel variant using 32-bit loads and stores
3122 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3123 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3124 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3125 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3126 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
// 8-bit blend of a span buffer toward one constant color, weighted by the
// color's fourth component: out = in + (color - in) * alpha / 256 per byte
// channel, for pixels from span->startx up to span->endx.
//   color - four floats; scaled by 255, with components 0 and 2 swapped to
//           match the pixel byte order
// The main loop handles two pixels (8 bytes) per iteration.
// NOTE(review): the single-pixel statements after the loop presumably handle
// an odd trailing pixel - confirm the guarding condition.
3131 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3134 int x, startx = span->startx, endx = span->endx;
3135 __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
// duplicate the four 16-bit channel values so they line up with two pixels
3136 localcolor = _mm_packs_epi32(localcolor, localcolor);
// broadcast the fourth channel to every lane and pre-shift it by 4 for the fixed-point multiply below
3137 blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3138 for (x = startx;x+2 <= endx;x+=2)
3140 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
// the difference is shifted left by 4 as well, so the signed high product is (color - in) * alpha / 256
3141 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3142 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
// single-pixel variant using 32-bit loads and stores
3146 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3147 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3148 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// Vertex stage for the generic mode: projects positions with the
// ModelViewProjection matrix uniform and passes the color and first texcoord
// arrays through unchanged. The second texcoord array is loaded only when the
// SPECULAR permutation bit is set.
3155 void DPSOFTRAST_VertexShader_Generic(void)
3157 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3158 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3159 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3160 if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3161 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
// Pixel stage for the generic mode: builds the span's color in
// buffer_FragColorbgra8 and hands it to DPSOFTRAST_Draw_Span_FinishBGRA8.
// With DIFFUSE set, the first texture (GL20TU_FIRST, attribute array 2) is
// multiplied by attribute array 1. With SPECULAR also set, the second texture
// (GL20TU_SECOND) is combined into the result by exactly one of:
//   COLORMAPPING       - multiply
//   GLOW               - add
//   VERTEXTEXTUREBLEND - alpha mix
// Fix: the additive branch used to test COLORMAPPING a second time, which
// made it unreachable; it is now keyed on the GLOW permutation bit.
// NOTE(review): confirm that GLOW is the bit intended to select the add.
// NOTE(review): the DPSOFTRAST_Draw_Span_VaryingBGRA8 call near the end
// presumably belongs to the path taken when DIFFUSE is not set - confirm.
3164 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3166 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3167 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3168 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3169 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3171 if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
// first texture modulated by the interpolated attribute
3173 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3174 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3175 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
// second texture, sampled with the same attribute array index as the first
3177 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3178 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3181 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3183 else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3186 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3188 else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3191 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3196 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3197 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for post-processing: projects positions with the
// ModelViewProjection matrix uniform and loads two texcoord arrays.
// Note that the second load writes DPSOFTRAST_ARRAY_TEXCOORD1 from the
// DPSOFTRAST_ARRAY_TEXCOORD4 input (argument order: output, input -
// NOTE(review): confirm against DPSOFTRAST_Array_Load).
3202 void DPSOFTRAST_VertexShader_PostProcess(void)
3204 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3205 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3206 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for post-processing: samples the first texture into the output
// buffer, optionally adds bloom from the second texture (BLOOM permutation),
// blends toward the ViewTintColor uniform, and finishes the span.
// The SATURATION and GAMMARAMPS permutations are recognized but have no
// implementation yet (see the TODOs below).
3209 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3211 // TODO: optimize!! at the very least there is no reason to use texture sampling on the frame texture
3212 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3213 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3214 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3215 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// first texture goes straight into the output buffer
3216 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3217 if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
// second texture minus the BloomColorSubtract uniform (clamped at zero) is added in place
3219 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3220 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
// in-place blend toward the view tint color, weighted by the tint's fourth component
3222 DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3223 if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3225 // TODO: implement saturation
3227 if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3229 // TODO: implement gammaramps
3231 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for depth/shadow rendering: only projects positions with the
// ModelViewProjection matrix uniform; no other arrays are set up.
3236 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3238 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Pixel stage for depth/shadow rendering: fills the span's pixels
// ([span->startx, span->endx), 4 bytes each) with zero bytes and finishes the
// span. Per the note below, this is not expected to run in practice.
3241 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3243 // this is never called (because colormask is off when this shader is used)
3244 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3245 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3246 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3247 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3248 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for flat-color rendering: projects positions with the
// ModelViewProjection matrix uniform and transforms the first texcoord array
// by the TexMatrix uniform.
3253 void DPSOFTRAST_VertexShader_FlatColor(void)
3255 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3256 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for flat-color rendering: output = color texture * Color_Ambient
// for the first three channels, and texture alpha * Alpha uniform for the
// fourth.
// Results are written straight into the framebuffer row unless alpha test is
// enabled or the blend mode is not opaque; in that case they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the loop contains further control flow between the two paths
// (presumably skipping masked-out pixels and advancing x after the 4-pixel
// path) - confirm before changing the loop.
3259 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3262 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3263 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3264 int x, startx = span->startx, endx = span->endx;
3265 __m128i Color_Ambientm;
3266 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3267 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3268 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3269 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3270 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finishing pass, so render into the local buffer instead
3271 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3272 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point with components 0 and 2 swapped to match the pixel byte order
3273 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
// replace the fourth lane with the Alpha uniform scaled by 255
3274 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3275 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3276 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3277 for (x = startx;x < endx;x++)
// fast path: at least four pixels remain and all four mask bytes equal 1
3280 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3283 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// texel bytes go into the high byte of each lane, so mulhi yields texel * factor / 256
3284 pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3285 pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3286 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3292 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3293 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3294 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered variant still has to be finished into the framebuffer
3296 if (pixel == buffer_FragColorbgra8)
3297 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for vertex-colored rendering: projects positions with the
// ModelViewProjection matrix uniform, passes the color array through and
// transforms the first texcoord array by the TexMatrix uniform.
3303 void DPSOFTRAST_VertexShader_VertexColor(void)
3305 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3306 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3307 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// Pixel stage for vertex-colored rendering:
// output = color texture * (Color_Diffuse * interpolated vertex color + Color_Ambient)
// for the first three channels; the fourth channel is texture alpha * Alpha
// uniform, because the diffuse term's fourth lane is cleared.
// Results are written straight into the framebuffer row unless alpha test is
// enabled or the blend mode is not opaque; in that case they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the loop contains further control flow between the two paths
// (presumably skipping masked-out pixels and advancing x after the 4-pixel
// path) - confirm before changing the loop.
3310 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3313 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3314 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3315 int x, startx = span->startx, endx = span->endx;
3316 __m128i Color_Ambientm, Color_Diffusem;
3318 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3319 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3320 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3321 int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3322 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3323 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
// alpha test or blending needs the finishing pass, so render into the local buffer instead
3324 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3325 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point (components 0 and 2 swapped), fourth lane = Alpha uniform * 255
3326 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3327 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3328 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3329 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color scaled by 4096 with its fourth lane cleared; combined with the
// vertex color (also scaled by 4096) the high product lands in 8.8 fixed point
3330 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3331 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3332 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
// vertex color as a linear function of x, reordered to the pixel byte order,
// advanced to startx and pre-scaled by 4096
3333 DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3334 data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3335 slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3336 data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3337 data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3338 slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
// data is stepped by slope once per loop iteration
3339 for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3341 __m128i color, mod, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3342 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3345 __m128 z = _mm_loadu_ps(&buffer_z[x]);
3346 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
// per-pixel vertex color = data * z[i]; data is stepped three extra times here
// to cover the pixels x+1..x+3
3347 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3348 data = _mm_add_ps(data, slope);
3349 mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3350 data = _mm_add_ps(data, slope);
3351 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3352 data = _mm_add_ps(data, slope);
3353 mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
// (diffuse * vertex color + ambient) * texel, two pixels per register
3354 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3355 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3356 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3357 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3358 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3364 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3365 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
3366 mod = _mm_packs_epi32(mod, mod);
3367 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3368 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered variant still has to be finished into the framebuffer
3370 if (pixel == buffer_FragColorbgra8)
3371 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage for lightmapped rendering: projects positions with the
// ModelViewProjection matrix uniform, transforms the first texcoord array by
// the TexMatrix uniform and passes the TEXCOORD4 array through unchanged
// (the pixel stage samples the lightmap with TEXCOORD4).
3377 void DPSOFTRAST_VertexShader_Lightmap(void)
3379 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3380 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3381 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
// Pixel stage for lightmapped rendering:
// output = color texture * (Color_Diffuse * lightmap + Color_Ambient)
// and, with the GLOW permutation, additionally + Color_Glow * glow texture.
// The fourth channel is texture alpha * Alpha uniform, because the fourth
// lanes of the diffuse and glow factors are cleared.
// Results are written straight into the framebuffer row unless alpha test is
// enabled or the blend mode is not opaque; in that case they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): each loop contains further control flow between its two paths
// (presumably skipping masked-out pixels and advancing x after the 4-pixel
// path), and the second loop is presumably the non-GLOW branch - confirm
// before changing either loop.
3384 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3387 unsigned char * RESTRICT pixelmask = span->pixelmask;
// start of this span's row in color buffer 0 (4 bytes per pixel)
3388 unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3389 int x, startx = span->startx, endx = span->endx;
3390 __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3391 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3392 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3393 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3395 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3396 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// color texture uses TEXCOORD0, lightmap uses TEXCOORD4
3397 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3398 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
// alpha test or blending needs the finishing pass, so render into the local buffer instead
3399 if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3400 pixel = buffer_FragColorbgra8;
// ambient color in 8.8 fixed point (components 0 and 2 swapped), fourth lane = Alpha uniform * 255
3401 Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402 Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403 Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3404 Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
// diffuse color in 8.8 fixed point with its fourth lane cleared
3405 Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3406 Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3407 Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3408 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
// glow texture shares TEXCOORD0 with the color texture
3410 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// glow color in 8.8 fixed point with its fourth lane cleared
3411 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3412 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3413 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
// low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3414 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3415 for (x = startx;x < endx;x++)
3417 __m128i color, lightmap, glow, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3418 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3421 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3422 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3423 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
// (diffuse * lightmap + ambient) * color + glowcolor * glow, two pixels per register
3424 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3425 _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3426 _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3427 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3428 _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3429 _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3430 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path: the lit color is computed in the low 64 bits and the glow term
// in the high 64 bits (where the lightmap lanes are zero), then the halves are summed
3436 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3437 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3438 glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3439 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3440 pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3441 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// same shading without the glow term
3446 for (x = startx;x < endx;x++)
3448 __m128i color, lightmap, pix;
// fast path: at least four pixels remain and all four mask bytes equal 1
3449 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3452 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3453 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3454 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3455 _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3456 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3457 _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3458 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
// single-pixel path
3464 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3465 lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3466 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3467 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
// only the buffered variant still has to be finished into the framebuffer
3470 if (pixel == buffer_FragColorbgra8)
3471 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Forward declarations: the FakeLight and LightDirectionMap shaders below
// delegate to the LightDirection shaders, which are defined after them.
3476 void DPSOFTRAST_VertexShader_LightDirection(void);
3477 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// Vertex stage of the fake-light shader.
// It has no work of its own: everything is delegated to the shared
// light-direction vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3484 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3486 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3491 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3493 DPSOFTRAST_VertexShader_LightDirection();
3494 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3497 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3499 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3504 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3506 DPSOFTRAST_VertexShader_LightDirection();
3507 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3510 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3512 DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3517 void DPSOFTRAST_VertexShader_LightDirection(void)
3520 int numvertices = dpsoftrast.numvertices;
3522 float LightVector[4];
3523 float EyePosition[4];
3524 float EyeVectorModelSpace[4];
3530 LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3531 LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3532 LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3533 LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3534 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3535 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3536 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3537 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3538 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3539 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3540 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3541 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3542 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3543 for (i = 0;i < numvertices;i++)
3545 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3546 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3547 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3548 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3549 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3550 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3551 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3552 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3553 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3554 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3555 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3556 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3557 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3558 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3559 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3560 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3561 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3562 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3563 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3564 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3565 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3566 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3567 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3568 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3569 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
3570 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3571 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3572 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3573 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3575 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3578 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3579 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3580 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3581 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3582 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3583 #define DPSOFTRAST_Vector3Normalize(v)\
3586 float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
// Shared pixel shader for the light-direction family of shader modes
// (plain light direction, fake light, and the model-space / tangent-space
// light-direction-map modes, selected through thread->shader_mode).
//
// For the span [span->startx, span->endx) it samples the textures needed by
// the active shader permutation into per-span BGRA8 byte buffers, shades
// every pixel into buffer_FragColorbgra8 and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// Three shading paths are visible, chosen by thread->shader_permutation:
//   SHADERPERMUTATION_SPECULAR : ambient + diffuse + specular (+ glow)
//   SHADERPERMUTATION_DIFFUSE  : ambient + diffuse (+ glow)
//   final loop                 : ambient only (+ glow)
//
// NOTE(review): `z` is read inside the loops but no assignment to it is
// visible in this view — presumably it is taken from buffer_z[x] at the top
// of each iteration; confirm against the full file.
3597 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3599 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3600 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3601 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3602 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3603 unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3604 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3605 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3606 unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3607 unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3608 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3609 int x, startx = span->startx, endx = span->endx;
3610 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3611 float LightVectordata[4];
3612 float LightVectorslope[4];
3613 float EyeVectordata[4];
3614 float EyeVectorslope[4];
3615 float VectorSdata[4];
3616 float VectorSslope[4];
3617 float VectorTdata[4];
3618 float VectorTslope[4];
3619 float VectorRdata[4];
3620 float VectorRslope[4];
3622 float diffusetex[4];
3624 float surfacenormal[4];
3625 float lightnormal[4];
3626 float lightnormal_modelspace[4];
3628 float specularnormal[4];
3631 float SpecularPower;
// Colour uniforms are stored with components 0 and 2 swapped, so that they
// line up with the B,G,R byte order of the bgra8 texture buffers.
// Component 3 of Color_Ambient carries the Alpha uniform; the other colours
// contribute nothing to alpha.
3633 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3634 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3635 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3636 Color_Glow[3] = 0.0f;
3637 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3638 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3639 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3640 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3641 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3642 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3643 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3644 Color_Pants[3] = 0.0f;
3645 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3646 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3647 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3648 Color_Shirt[3] = 0.0f;
// Start the span (fills buffer_z) and sample the textures every path needs.
// Pants/shirt and glow textures are only sampled when their permutation bit
// is set; their buffers are otherwise left uninitialized and must not be read.
3649 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3650 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3653 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3654 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3656 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3658 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// ---- Path 1: specular permutation (ambient + diffuse + specular) ----
3660 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3662 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3663 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3664 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3665 Color_Diffuse[3] = 0.0f;
3666 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3667 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3668 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3669 LightColor[3] = 0.0f;
3670 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3671 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3672 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3673 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3674 Color_Specular[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss texture's
// alpha byte (0..255) to form the pow() exponent
3675 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3676 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3677 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs: the model-space map mode needs the interpolated
// S/T/R vectors plus lightmap and deluxemap, the tangent-space map mode only
// the two textures, fake light nothing extra, and the remaining case the
// interpolated light vector from TEXCOORD5
3679 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3681 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3682 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3683 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3684 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3685 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3687 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3689 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3690 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3692 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3694 // nothing of this needed
3698 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
// per-pixel shading; all colour math stays in 0..255 byte scale
3701 for (x = startx;x < endx;x++)
3704 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3705 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3706 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3707 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// colormapping: add the tinted pants and shirt layers to the base colour
// (alpha is unaffected because Color_Pants[3] and Color_Shirt[3] are 0)
3708 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3710 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3711 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3712 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3713 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3715 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3716 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3717 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3718 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map texel: byte/128 - 1 maps 0..255 to about [-1, 1);
// the byte indices are reversed (2,1,0) to turn B,G,R storage into x,y,z
3719 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3720 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3721 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3722 DPSOFTRAST_Vector3Normalize(surfacenormal);
// derive lightnormal and LightColor for this pixel according to shader_mode
3724 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3726 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3727 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3728 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3729 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3731 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3732 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3733 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3734 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3736 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3737 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3738 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3739 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3741 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3742 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3743 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3744 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3746 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3747 DPSOFTRAST_Vector3Normalize(lightnormal);
3749 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
// lightmap bytes are scaled by 1/256 and divided by lightnormal z, with the
// divisor floored at 0.25 so the result cannot grow beyond 4x
3751 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3752 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3753 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3754 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3757 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// the deluxemap texel is used directly as the light normal (not normalized here)
3759 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3760 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3761 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3763 float f = 1.0f / 256.0f;
3764 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3765 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3766 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3769 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
// fake light: the light shines from the eye, with white light colour
3771 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3772 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3773 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3774 DPSOFTRAST_Vector3Normalize(lightnormal);
3776 LightColor[0] = 1.0;
3777 LightColor[1] = 1.0;
3778 LightColor[2] = 1.0;
// remaining case: interpolated light vector; LightColor keeps the uniform value
3782 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3783 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3784 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3785 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term, clamped at zero
3788 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// specular term: either the exact reflection vector dotted with the eye
// direction, or the half-vector approximation dotted with the surface normal
3790 if(thread->shader_exactspecularmath)
3792 // reflect lightnormal at surfacenormal, take the negative of that
3793 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3795 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3796 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3797 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3798 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3800 // dot of this and normalize(EyeVectorFogDepth.xyz)
3801 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3802 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3803 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3804 DPSOFTRAST_Vector3Normalize(eyenormal);
3806 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3810 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3811 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3812 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3813 DPSOFTRAST_Vector3Normalize(eyenormal);
3815 specularnormal[0] = lightnormal[0] + eyenormal[0];
3816 specularnormal[1] = lightnormal[1] + eyenormal[1];
3817 specularnormal[2] = lightnormal[2] + eyenormal[2];
3818 DPSOFTRAST_Vector3Normalize(specularnormal);
3820 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// exponent = SpecularPower uniform * gloss alpha / 255
3823 specular = pow(specular, SpecularPower * glosstex[3]);
// combine the terms; each channel is clamped to 255 from above only
// NOTE(review): there is no lower clamp, so a negative sum would wrap when
// stored into the unsigned char buffer — confirm the inputs are never negative
3824 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3826 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3827 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3828 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3829 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3833 d[0] = (int)( diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3834 d[1] = (int)( diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3835 d[2] = (int)( diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3836 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3839 buffer_FragColorbgra8[x*4+0] = d[0];
3840 buffer_FragColorbgra8[x*4+1] = d[1];
3841 buffer_FragColorbgra8[x*4+2] = d[2];
3842 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 2: diffuse permutation (ambient + diffuse, no specular) ----
// NOTE(review): unlike the specular path, this path does not add the
// pants/shirt layers to diffusetex even when COLORMAPPING is set — confirm
// whether that is intentional.
3845 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3847 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3848 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3849 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3850 Color_Diffuse[3] = 0.0f;
3851 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3852 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3853 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3854 LightColor[3] = 0.0f;
3855 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
// mode-specific inputs, as in the specular path; here the eye vector is only
// interpolated for the fake-light mode, where it serves as the light direction
3857 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3859 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3860 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3861 DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3862 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3863 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3865 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3867 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3868 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3870 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3872 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3876 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3879 for (x = startx;x < endx;x++)
3882 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3883 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3884 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3885 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
// decode the normal map texel (byte/128 - 1, B,G,R -> x,y,z)
3886 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3887 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3888 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3889 DPSOFTRAST_Vector3Normalize(surfacenormal);
// derive lightnormal and LightColor; same per-mode logic as the specular path
3891 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3893 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3894 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3895 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3896 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3898 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3899 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3900 + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3901 + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3903 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3904 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3905 + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3906 + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3908 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3909 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3910 + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3911 + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3913 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3914 DPSOFTRAST_Vector3Normalize(lightnormal);
3916 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3918 float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3919 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3920 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3921 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3924 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3926 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3927 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3928 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3930 float f = 1.0f / 256.0f;
3931 LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3932 LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3933 LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3936 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3938 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3939 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3940 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3941 DPSOFTRAST_Vector3Normalize(lightnormal);
3943 LightColor[0] = 1.0;
3944 LightColor[1] = 1.0;
3945 LightColor[2] = 1.0;
3949 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3950 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3951 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3952 DPSOFTRAST_Vector3Normalize(lightnormal);
// diffuse term, clamped at zero, then combine (upper clamp to 255 only)
3955 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3956 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3958 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3959 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3960 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3961 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3965 d[0] = (int)( + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3966 d[1] = (int)( + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3967 d[2] = (int)( + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3968 d[3] = (int)( diffusetex[3] * (Color_Ambient[3] ));if (d[3] > 255) d[3] = 255;
3970 buffer_FragColorbgra8[x*4+0] = d[0];
3971 buffer_FragColorbgra8[x*4+1] = d[1];
3972 buffer_FragColorbgra8[x*4+2] = d[2];
3973 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- Path 3: ambient only (base colour * Color_Ambient, plus glow) ----
3978 for (x = startx;x < endx;x++)
3981 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3982 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3983 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3984 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3986 if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3988 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3989 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3990 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3991 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3995 d[0] = (int)( diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3996 d[1] = (int)( diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3997 d[2] = (int)( diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3998 d[3] = (int)( diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4000 buffer_FragColorbgra8[x*4+0] = d[0];
4001 buffer_FragColorbgra8[x*4+1] = d[1];
4002 buffer_FragColorbgra8[x*4+2] = d[2];
4003 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish stage
4006 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4011 void DPSOFTRAST_VertexShader_LightSource(void)
4014 int numvertices = dpsoftrast.numvertices;
4015 float LightPosition[4];
4016 float LightVector[4];
4017 float LightVectorModelSpace[4];
4018 float EyePosition[4];
4019 float EyeVectorModelSpace[4];
4025 LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4026 LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4027 LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4028 LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4029 EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4030 EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4031 EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4032 EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4033 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4034 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4035 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4036 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4037 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4038 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4039 for (i = 0;i < numvertices;i++)
4041 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4042 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4043 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4044 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4045 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4046 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4047 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4048 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4049 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4050 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4051 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4052 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4053 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4054 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4055 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4056 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4057 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4058 LightVector[2] = normal[0] * LightVectorModelSpace[0] + normal[1] * LightVectorModelSpace[1] + normal[2] * LightVectorModelSpace[2];
4059 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4060 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4061 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4062 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4063 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4064 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4065 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4066 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4067 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4068 EyeVector[2] = normal[0] * EyeVectorModelSpace[0] + normal[1] * EyeVectorModelSpace[1] + normal[2] * EyeVectorModelSpace[2];
4069 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4070 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4071 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4072 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4074 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4075 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
// Span pixel shader for the light-source shader mode.
// For the pixels startx..endx-1 of one span it:
//  - loads the colour uniforms from thread->uniform4f,
//  - samples the colour texture and, depending on thread->shader_permutation,
//    the pants/shirt, cube, normal and gloss textures into per-span byte buffers,
//  - computes a per-pixel attenuation from the interpolated TEXCOORD3 vector
//    (optionally multiplied by a shadowmap sample),
//  - shades the pixel in one of three loops (SPECULAR, DIFFUSE, or a final loop
//    that only uses Color_Ambient) and stores the result in buffer_FragColorbgra8,
//  - hands the finished buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// NOTE(review): the embedded line numbers in this listing have gaps (e.g. 4156 -> 4159),
// so short lines such as braces, else/continue and a few declarations are not visible here.
4078 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4081 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4082 unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4083 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4084 unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085 unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4086 unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4087 unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4088 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4089 int x, startx = span->startx, endx = span->endx;
4090 float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4091 float CubeVectordata[4];
4092 float CubeVectorslope[4];
4093 float LightVectordata[4];
4094 float LightVectorslope[4];
4095 float EyeVectordata[4];
4096 float EyeVectorslope[4];
4098 float diffusetex[4];
4100 float surfacenormal[4];
4101 float lightnormal[4];
4103 float specularnormal[4];
4106 float SpecularPower;
4107 float CubeVector[4];
// Load the colour uniforms with components 0 and 2 swapped (uniform +0 lands in
// index 2, uniform +2 in index 0), so index i lines up with byte i of the *bgra8
// buffers they are multiplied with below.
// Color_Glow is loaded but not read again in the visible part of this function.
4110 Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4111 Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4112 Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4113 Color_Glow[3] = 0.0f;
4114 Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4115 Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4116 Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
// the fourth ambient component is taken from the Alpha uniform, not from Color_Ambient
4117 Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4118 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4119 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4120 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4121 Color_Diffuse[3] = 0.0f;
4122 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4123 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4124 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4125 Color_Specular[3] = 0.0f;
4126 Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4127 Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4128 Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4129 Color_Pants[3] = 0.0f;
4130 Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4131 Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4132 Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4133 Color_Shirt[3] = 0.0f;
4134 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4135 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4136 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4137 LightColor[3] = 0.0f;
// pre-divided by 255 because it is later multiplied by the gloss alpha byte (0..255)
4138 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
// Fill the data/slope pairs for TEXCOORD1..3; the loops below evaluate them as
// (data + slope*x) * z. What the macro computes is not visible in this chunk.
4139 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4140 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4141 DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4142 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4143 memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4144 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4145 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4147 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4148 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4150 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4151 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
// ---- path 1: SPECULAR permutation (ambient + diffuse + specular terms) ----
4152 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4154 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4155 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4156 for (x = startx;x < endx;x++)
4159 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4160 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4161 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
// attenuation falls off with the squared length of CubeVector and reaches 0 at length 1
4162 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4163 if (attenuation < 0.01f)
4165 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4167 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4168 if (attenuation < 0.01f)
4172 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4173 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4174 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4175 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4176 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
// add the pants/shirt layers tinted by their colours; component 3 of both tints is 0, so alpha is unchanged
4178 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4179 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4180 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4181 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4183 glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4184 glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4185 glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4186 glosstex[3] = buffer_texture_glossbgra8[x*4+3];
// decode the normal map: bytes 2,1,0 of the bgra8 texel become components 0,1,2, mapped from 0..255 to about -1..+1
4187 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4188 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4189 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4190 DPSOFTRAST_Vector3Normalize(surfacenormal);
4192 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4193 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4194 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4195 DPSOFTRAST_Vector3Normalize(lightnormal);
4197 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
// exact specular: reflect the light vector about the surface normal and compare with the eye vector
4199 if(thread->shader_exactspecularmath)
4201 // reflect lightnormal at surfacenormal, take the negative of that
4202 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4204 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4205 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4206 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4207 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4209 // dot of this and normalize(EyeVectorFogDepth.xyz)
4210 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4211 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4212 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4213 DPSOFTRAST_Vector3Normalize(eyenormal);
4215 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// alternative specular: normalized sum of light and eye vectors (half vector) dotted with the surface normal
4219 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4220 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4221 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4222 DPSOFTRAST_Vector3Normalize(eyenormal);
4224 specularnormal[0] = lightnormal[0] + eyenormal[0];
4225 specularnormal[1] = lightnormal[1] + eyenormal[1];
4226 specularnormal[2] = lightnormal[2] + eyenormal[2];
4227 DPSOFTRAST_Vector3Normalize(specularnormal);
4229 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
// the exponent is the SpecularPower uniform scaled by the gloss texel's alpha byte
4231 specular = pow(specular, SpecularPower * glosstex[3]);
4233 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4235 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4236 attenuation *= (1.0f / 255.0f);
4237 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4238 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4239 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4240 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4244 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4245 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4246 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4247 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4249 buffer_FragColorbgra8[x*4+0] = d[0];
4250 buffer_FragColorbgra8[x*4+1] = d[1];
4251 buffer_FragColorbgra8[x*4+2] = d[2];
4252 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 2: DIFFUSE permutation (ambient + diffuse terms, no gloss texture) ----
4255 else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4257 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4258 for (x = startx;x < endx;x++)
4261 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4262 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4263 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4264 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4265 if (attenuation < 0.01f)
4267 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4269 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4270 if (attenuation < 0.01f)
4274 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4275 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4276 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4277 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4278 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4280 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4281 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4282 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4283 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4285 surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4286 surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4287 surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4288 DPSOFTRAST_Vector3Normalize(surfacenormal);
4290 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4291 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4292 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4293 DPSOFTRAST_Vector3Normalize(lightnormal);
4295 diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4296 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4298 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4299 attenuation *= (1.0f / 255.0f);
4300 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4301 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4302 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4303 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4307 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4308 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4309 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4310 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4312 buffer_FragColorbgra8[x*4+0] = d[0];
4313 buffer_FragColorbgra8[x*4+1] = d[1];
4314 buffer_FragColorbgra8[x*4+2] = d[2];
4315 buffer_FragColorbgra8[x*4+3] = d[3];
// ---- path 3: remaining case; only Color_Ambient is applied, no normal map is sampled ----
4320 for (x = startx;x < endx;x++)
4323 CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4324 CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4325 CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4326 attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4327 if (attenuation < 0.01f)
4329 if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4331 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4332 if (attenuation < 0.01f)
4336 diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4337 diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4338 diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4339 diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4340 if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4342 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4343 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4344 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4345 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4347 if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4349 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4350 attenuation *= (1.0f / 255.0f);
4351 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4352 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4353 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4354 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4358 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * attenuation);if (d[0] > 255) d[0] = 255;
4359 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * attenuation);if (d[1] > 255) d[1] = 255;
4360 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * attenuation);if (d[2] > 255) d[2] = 255;
4361 d[3] = (int)( diffusetex[3] );if (d[3] > 255) d[3] = 255;
4363 buffer_FragColorbgra8[x*4+0] = d[0];
4364 buffer_FragColorbgra8[x*4+1] = d[1];
4365 buffer_FragColorbgra8[x*4+2] = d[2];
4366 buffer_FragColorbgra8[x*4+3] = d[3];
// hand the shaded span to the common finish routine (its behaviour is defined outside this chunk)
4369 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the refraction shader mode. Takes no arguments; it works on the
// global dpsoftrast state and issues three array transforms with matrices taken
// from dpsoftrast.uniform4f.
// NOTE(review): the first array argument appears to be the destination and the second
// the source (the refraction pixel shader reads TEXCOORD1 as "ModelViewProjectionPosition")
// - confirm against the DPSOFTRAST_Array_Transform definition.
4375 void DPSOFTRAST_VertexShader_Refraction(void)
// TEXCOORD1 <- POSITION through ModelViewProjectionMatrixM1, using the non-"Project" variant
4377 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// TEXCOORD0 is transformed in place through TexMatrixM1
4378 DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
// POSITION is transformed in place last, after TEXCOORD1 has been derived from the untransformed positions
4379 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the refraction shader mode.
// For every pixel of the span it derives a screen-space texture coordinate from the
// interpolated TEXCOORD1 attribute, offsets it sideways by the normal-map texel
// scaled with DistortScaleRefractReflect, fetches the refraction texture at that
// coordinate (bilinear when the texture has DPSOFTRAST_TEXTURE_FILTER_LINEAR set,
// single texel otherwise) and multiplies the result by RefractColor.
// Returns without drawing when no texture is bound to GL20TU_REFRACTION.
//
// Fixes in the bilinear path relative to the previous revision:
//  - the second coordinate subtracted 32678 instead of 32768 (half a texel in 16.16),
//  - the blend fraction was taken from the low 12 bits of a 16-bit fraction
//    (tc & 0xFFF); it is now the top 12 bits of the fraction ((tc >> 4) & 0xFFF), which
//    matches tci = tc >> 16 and the >>24 normalisation of the 12bit*12bit weights.
// NOTE(review): the embedded line numbers in this listing have gaps, so short lines
// (braces, else, a few declarations and assignments) are not visible here.
4382 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4384 // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4386 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4388 int x, startx = span->startx, endx = span->endx;
4391 unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4392 //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4393 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4396 float ModelViewProjectionPositiondata[4];
4397 float ModelViewProjectionPositionslope[4];
4400 float ScreenScaleRefractReflect[2];
4401 float ScreenCenterRefractReflect[2];
4402 float DistortScaleRefractReflect[2];
4403 float RefractColor[4];
4405 const unsigned char * RESTRICT pixelbase;
4406 const unsigned char * RESTRICT pixel[4];
4407 DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4408 if(!texture) return;
// base of mip level 0 inside the texture's byte storage
4409 pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4412 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4413 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4414 //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4417 DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4420 ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4421 ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4422 ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4423 ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4424 DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4425 DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
// components 0 and 2 are swapped so RefractColor[i] lines up with byte i of the bgra8 output
4426 RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4427 RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4428 RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4429 RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4432 for (x = startx;x < endx;x++)
4434 float SafeScreenTexCoord[2];
4435 float ScreenTexCoord[2];
4442 // " vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4443 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4445 // " vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4446 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4447 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4449 // " vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
// decode the normal-map texel (bytes 2,1,0 -> components 0,1,2, mapped to about -1..+1)
4450 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4451 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4452 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4453 DPSOFTRAST_Vector3Normalize(v);
4454 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4455 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4457 // " dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4458 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// bilinear fetch: tc is a 16.16 fixed-point texel coordinate shifted back by half a texel (32768) on both axes
// NOTE(review): tc is unsigned, so a negative coordinate cannot yield a negative tci below - confirm this is acceptable
4460 unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32768};
// top 12 bits of the 16-bit fraction, so that frac/0x1000 is the position between texel tci and tci+1
4461 unsigned int frac[2] = { (tc[0]>>4)&0xFFF, (tc[1]>>4)&0xFFF };
4462 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
// the four weights always sum to 0x1000*0x1000 = 1<<24, hence the >>24 after blending
4463 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4464 int tci[2] = { tc[0]>>16, tc[1]>>16 };
4465 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
// clamp both texel indices to the mip 0 extents (mipmap[0][2] wide, mipmap[0][3] high)
4466 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4467 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4468 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4469 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4470 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4471 pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4472 pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4473 pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4474 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4475 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4476 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
// non-linear filter: only pixel[0] is set up in the visible lines of this branch
4480 int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4481 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4482 tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4483 tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4484 tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4485 tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4486 pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4492 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
// tint the fetched colour; alpha comes from RefractColor[3] alone, scaled to bytes and capped at 255
4493 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4494 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4495 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4496 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4499 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the water shader mode: its only visible statement transforms the
// POSITION array in place (same array passed twice) with the matrix stored at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 in dpsoftrast.uniform4f.
4504 void DPSOFTRAST_VertexShader_Water(void)
4506 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the water shader mode. The visible code does no shading:
// it begins the span, zero-fills the BGRA8 fragment buffer for pixels
// span->startx .. span->endx-1 (4 bytes per pixel) and finishes the span with that buffer.
// NOTE(review): looks like a placeholder implementation - confirm.
4510 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4513 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4514 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4515 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span are cleared; the rest of the buffer stays uninitialized
4516 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4517 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the show-depth shader mode: its only visible statement transforms the
// POSITION array in place (same array passed twice) with the matrix stored at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 in dpsoftrast.uniform4f.
4522 void DPSOFTRAST_VertexShader_ShowDepth(void)
4524 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the show-depth shader mode. The visible code does no shading:
// it begins the span, zero-fills the BGRA8 fragment buffer for pixels
// span->startx .. span->endx-1 (4 bytes per pixel) and finishes the span with that buffer.
// NOTE(review): looks like a placeholder implementation - confirm.
4527 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4530 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4531 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4532 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span are cleared; the rest of the buffer stays uninitialized
4533 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4534 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the deferred-geometry shader mode: its only visible statement transforms
// the POSITION array in place (same array passed twice) with the matrix stored at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 in dpsoftrast.uniform4f.
4539 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4541 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the deferred-geometry shader mode. The visible code does no shading:
// it begins the span, zero-fills the BGRA8 fragment buffer for pixels
// span->startx .. span->endx-1 (4 bytes per pixel) and finishes the span with that buffer.
// NOTE(review): looks like a placeholder implementation - confirm.
4544 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4547 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4548 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4549 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span are cleared; the rest of the buffer stays uninitialized
4550 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4551 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Vertex stage of the deferred-light-source shader mode: its only visible statement transforms
// the POSITION array in place (same array passed twice) with the matrix stored at
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 in dpsoftrast.uniform4f.
4556 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4558 DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
// Span pixel shader for the deferred-light-source shader mode. The visible code does no shading:
// it begins the span, zero-fills the BGRA8 fragment buffer for pixels
// span->startx .. span->endx-1 (4 bytes per pixel) and finishes the span with that buffer.
// NOTE(review): looks like a placeholder implementation - confirm.
4561 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4564 float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4565 unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4566 DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
// only the bytes belonging to this span are cleared; the rest of the buffer stays uninitialized
4567 memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4568 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
// Describes one shader mode: the vertex-stage and span (pixel) stage entry points plus
// the vertex arrays and texture units the mode uses.
// NOTE(review): the embedded numbering skips 4575, and every initializer in
// DPSOFTRAST_ShaderModeTable starts with an integer (2) before the function pointers,
// so there is presumably one more leading member that this listing does not show.
4573 typedef struct DPSOFTRAST_ShaderModeInfo_s
// vertex-stage entry point; takes no arguments
4576 void (*Vertex)(void);
// span-stage entry point; receives the thread state, the triangle and the span to shade
4577 void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
// list of DPSOFTRAST_ARRAY_* indices; the table initializers end each list with ~0
4578 unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
// list of GL20TU_* texture unit indices; the table initializers end each list with ~0 where one is given
4579 unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4581 DPSOFTRAST_ShaderModeInfo;
// Table of shader mode descriptors, SHADERMODE_COUNT entries long (16 rows are visible here).
// Each row gives: a leading integer (2 in every row), the vertex shader, the pixel (span)
// shader, the list of vertex arrays and the list of texture units; lists end with ~0.
// NOTE(review): rows are presumably ordered to match the SHADERMODE_* enumeration that
// indexes this table - verify against the enum definition.
// NOTE(review): the last four rows (Water, ShowDepth, DeferredGeometry, DeferredLightSource)
// give no texunits initializer, so that member is zero-filled and has no ~0 terminator;
// confirm that consumers of texunits cope with this.
4583 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4585 {2, DPSOFTRAST_VertexShader_Generic, DPSOFTRAST_PixelShader_Generic, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4586 {2, DPSOFTRAST_VertexShader_PostProcess, DPSOFTRAST_PixelShader_PostProcess, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4587 {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow, DPSOFTRAST_PixelShader_Depth_Or_Shadow, {~0}, {~0}},
4588 {2, DPSOFTRAST_VertexShader_FlatColor, DPSOFTRAST_PixelShader_FlatColor, {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4589 {2, DPSOFTRAST_VertexShader_VertexColor, DPSOFTRAST_PixelShader_VertexColor, {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4590 {2, DPSOFTRAST_VertexShader_Lightmap, DPSOFTRAST_PixelShader_Lightmap, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4591 {2, DPSOFTRAST_VertexShader_FakeLight, DPSOFTRAST_PixelShader_FakeLight, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4592 {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace, DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4593 {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4594 {2, DPSOFTRAST_VertexShader_LightDirection, DPSOFTRAST_PixelShader_LightDirection, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4595 {2, DPSOFTRAST_VertexShader_LightSource, DPSOFTRAST_PixelShader_LightSource, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4596 {2, DPSOFTRAST_VertexShader_Refraction, DPSOFTRAST_PixelShader_Refraction, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4597 {2, DPSOFTRAST_VertexShader_Water, DPSOFTRAST_PixelShader_Water, {~0}},
4598 {2, DPSOFTRAST_VertexShader_ShowDepth, DPSOFTRAST_PixelShader_ShowDepth, {~0}},
4599 {2, DPSOFTRAST_VertexShader_DeferredGeometry, DPSOFTRAST_PixelShader_DeferredGeometry, {~0}},
4600 {2, DPSOFTRAST_VertexShader_DeferredLightSource, DPSOFTRAST_PixelShader_DeferredLightSource, {~0}},
// Runs the depth test and the pixel shader for every span queued on this
// thread, then empties the span queue.
//
// thread: per-thread rasterizer state; reads thread->spans[0..numspans) and the
//         triangles they reference, and resets thread->numspans to 0 at the end.
//
// NOTE(review): this listing omits short source lines (braces, a few short
// statements), so the comments below describe only the statements that are visible.
4603 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4610 // unsigned int *colorpixel;
4611 unsigned int *depthpixel;
4617 DPSOFTRAST_State_Triangle *triangle;
4618 DPSOFTRAST_State_Span *span;
// per-pixel visibility flags for the span being processed; handed to the
// shader through span->pixelmask
4619 unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4620 for (i = 0; i < thread->numspans; i++)
4622 span = &thread->spans[i];
4623 triangle = &thread->triangles[span->triangle];
// depth-tested path: taken only when depth testing is enabled AND a depth buffer is bound
4624 if (thread->depthtest && dpsoftrast.fb_depthpixels)
// evaluate the triangle's w plane at the span origin:
// w[0] is the per-x slope, w[1] the per-y slope, w[2] the constant term
4626 wslope = triangle->w[0];
4627 w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
// convert to integer depth units; the polygon offset terms are subtracted
// from the starting depth only (the per-pixel slope is unaffected)
4628 depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4629 depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
// depthpixel[x] addresses the depth buffer relative to (span->x, span->y)
4630 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4631 startx = span->startx;
// fill pixelmask[startx..endx) by comparing the stored depth against the
// interpolated depth d with the selected function
// (endx is assigned on a line not shown in this listing)
4633 switch(thread->fb_depthfunc)
4636 case GL_ALWAYS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4637 case GL_LESS: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4638 case GL_LEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4639 case GL_EQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4640 case GL_GEQUAL: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4641 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4642 case GL_NEVER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4644 //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4645 //for (x = startx;x < endx;x++)
4646 // colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4647 // if there is no color buffer, skip pixel shader
// skip over rejected pixels at both ends of the span
// (the loop bodies and the emptiness check are not shown in this listing)
4648 while (startx < endx && !pixelmask[startx])
4650 while (endx > startx && !pixelmask[endx-1])
4653 continue; // no pixels to fill
// publish the mask and the trimmed start to the shader
4654 span->pixelmask = pixelmask;
4655 span->startx = startx;
4657 // run pixel shader if appropriate
4658 // do this before running depthmask code, to allow the pixelshader
4659 // to clear pixelmask values for alpha testing
4660 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4661 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// depth writes enabled: walk the span again with the interpolated depth
// (the per-pixel statement of this loop is not shown in this listing)
4662 if (thread->depthmask)
4663 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4669 // no depth testing means we're just dealing with color...
4670 // if there is no color buffer, skip pixel shader
4671 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
// without a depth test every pixel in [span->startx, span->endx) is visible
4673 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4674 span->pixelmask = pixelmask;
4675 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
// every queued span has been consumed
4679 thread->numspans = 0;
// Declares the Draw command (opcode 22) through the DEFCOMMAND macro, which is
// defined outside this view. Code in this file also reads command->opcode and
// command->commandsize, so the macro presumably adds those header fields.
// starty/endy bound the draw in y, refcount is an ATOMIC_COUNTER decremented once
// per interpreting thread, arrays points at the post-transform vertex data, and
// element3i/element3s are the two alternative index formats.
4682 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
// Executes one queued Draw command on behalf of a single thread.
//
// thread:  per-thread rasterizer state (scissor, y bands, span/triangle queues).
// command: the Draw command; its refcount is decremented once by this call, and
//          when it reaches zero and the vertex data was not stored inline in the
//          command (commandsize no larger than the bare command header),
//          command->arrays is released with MM_FREE.
//
// For every triangle the visible code: fetches the three indices, clips against
// the near plane, builds a 16-bit screen bounding box clamped to the scissor/band
// limits, sets up the w and attribute interpolation planes, picks a mip level per
// texture unit, and emits spans that DPSOFTRAST_Draw_ProcessSpans later consumes.
//
// NOTE(review): this listing omits short source lines (braces, short statements,
// some conditions); the comments below describe only the statements shown.
4684 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4687 int cullface = thread->cullface;
4688 int minx, maxx, miny, maxy;
4689 int miny1, maxy1, miny2, maxy2;
4690 __m128i fbmin, fbmax;
4691 __m128 viewportcenter, viewportscale;
4692 int firstvertex = command->firstvertex;
4693 int numvertices = command->numvertices;
4694 int numtriangles = command->numtriangles;
4695 const int *element3i = command->element3i;
4696 const unsigned short *element3s = command->element3s;
4697 int clipped = command->clipped;
4704 int starty, endy, bandy;
4708 __m128 triangleedge1, triangleedge2, trianglenormal;
4711 DPSOFTRAST_State_Triangle *triangle;
4712 DPSOFTRAST_Texture *texture;
4713 DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
// clamp both of this thread's y bands (miny1..maxy1 and miny2..maxy2) to the scissor's y range
4714 miny = thread->fb_scissor[1];
4715 maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4716 miny1 = bound(miny, thread->miny1, maxy);
4717 maxy1 = bound(miny, thread->maxy1, maxy);
4718 miny2 = bound(miny, thread->miny2, maxy);
4719 maxy2 = bound(miny, thread->maxy2, maxy);
// the command's y range touches neither band: nothing to draw on this thread,
// but this thread's reference on the command must still be dropped
4720 if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4722 if (!ATOMIC_DECREMENT(command->refcount))
// last reference gone; free the vertex data only when it was allocated
// separately from the command (see DPSOFTRAST_Draw_AllocateDrawCommand)
4724 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4725 MM_FREE(command->arrays);
4729 minx = thread->fb_scissor[0];
4730 maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
// packed 16-bit (x, y) limits repeated for four points; fbmax is inclusive (max - 1).
// The y limits span from the start of the first band to the end of the second.
4731 fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4732 fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4733 viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4734 viewportscale = _mm_load_ps(thread->fb_viewportscale);
4735 screen[3] = _mm_setzero_ps();
4736 clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4737 for (i = 0;i < numtriangles;i++)
// command->arrays starts with the screen-space coordinates (4 floats per vertex);
// the next array of the same size is read below for the near-plane distances
4739 const float *screencoord4f = command->arrays;
4740 const float *arrays = screencoord4f + numvertices*4;
4742 // generate the 3 edges of this triangle
4743 // generate spans for the triangle - switch based on left split or right split classification of triangle
// fetch the triangle's vertex indices, rebased to firstvertex; the 16-bit and
// 32-bit index variants follow (the selecting conditions are not shown)
4746 e[0] = element3s[i*3+0] - firstvertex;
4747 e[1] = element3s[i*3+1] - firstvertex;
4748 e[2] = element3s[i*3+2] - firstvertex;
4752 e[0] = element3i[i*3+0] - firstvertex;
4753 e[1] = element3i[i*3+1] - firstvertex;
4754 e[2] = element3i[i*3+2] - firstvertex;
// Helper macros used by the clipping code below (parts of their bodies are not
// shown in this listing):
//   SKIPBACKFACE          - forms the edges screen[0]-screen[1] and screen[2]-screen[1],
//                           computes a scalar cross-product term into trianglenormal
//                           and sign-tests it against zero.
//   CLIPPEDVERTEXLERP     - stores the crossing fraction of edge p1->p2 in clipfrac[p1]
//                           and projects the interpolated vertex into screen[k].
//   CLIPPEDVERTEXCOPY     - loads the already-projected vertex p1 into screen[k].
//   GENATTRIBCOPY / LERP  - load an attribute for vertex p1, or interpolate it along
//                           p1->p2 using the stored clipfrac[p1].
//   GENATTRIBS            - picks copy/lerp per output point from a case number 0..6
//                           (the switch expression is not shown).
4763 #define SKIPBACKFACE \
4764 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4765 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4766 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4767 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4768 _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4772 if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4776 if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4781 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4782 clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4784 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4785 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4787 #define CLIPPEDVERTEXCOPY(k,p1) \
4788 screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4790 #define GENATTRIBCOPY(attrib, p1) \
4791 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4792 #define GENATTRIBLERP(attrib, p1, p2) \
4794 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4795 attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4797 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4801 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4804 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4805 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4806 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4807 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4813 // calculate distance from nearplane
// clipdist = z + w of each vertex; a negative value means the vertex is behind the plane
4814 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4815 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4816 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
// classify by which vertices are in front: in-front vertices are copied, and each
// edge that crosses the plane contributes an interpolated vertex, giving three or
// four output points in screen[] (the point-count/case bookkeeping lines are not shown)
4817 if (clipdist[0] >= 0.0f)
4819 if (clipdist[1] >= 0.0f)
4821 if (clipdist[2] >= 0.0f)
4824 // triangle is entirely in front of nearplane
4825 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4832 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4840 if (clipdist[2] >= 0.0f)
4842 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4849 CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4856 else if (clipdist[1] >= 0.0f)
4858 if (clipdist[2] >= 0.0f)
4860 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4867 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4873 else if (clipdist[2] >= 0.0f)
4875 CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4880 else continue; // triangle is entirely behind nearplane
4883 // calculate integer y coords for triangle points
// screeni packs the truncated (x, y) of up to four points as eight signed 16-bit
// values (point 2 is duplicated when there is no fourth point); the shuffles
// reduce them to a per-axis min and max, which are then clamped to fbmin/fbmax
4884 __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4885 screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4886 screenmin = _mm_min_epi16(screeni, screenir),
4887 screenmax = _mm_max_epi16(screeni, screenir);
4888 screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4889 screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4890 screenmin = _mm_max_epi16(screenmin, fbmin);
4891 screenmax = _mm_min_epi16(screenmax, fbmax);
4892 // skip offscreen triangles
4893 if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
// element 1 of the packed min/max is the y component; endy is exclusive
4895 starty = _mm_extract_epi16(screenmin, 1);
4896 endy = _mm_extract_epi16(screenmax, 1)+1;
// the triangle's rows fall entirely in the gap between this thread's two bands
4897 if (starty >= maxy1 && endy <= miny2)
// arithmetic shift by 16 leaves the y of each packed (x, y) pair as a 32-bit integer
4899 screeny = _mm_srai_epi32(screeni, 16);
4902 triangle = &thread->triangles[thread->numtriangles];
4904 // calculate attribute plans for triangle data...
4905 // okay, this triangle is going to produce spans, we'd better project
4906 // the interpolants now (this is what gives perspective texturing),
4907 // this consists of simply multiplying all arrays by the W coord
4908 // (which is basically 1/Z), which will be undone per-pixel
4909 // (multiplying by Z again) to get the perspective-correct array
4912 __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4913 __m128 mipedgescale, mipdensity;
// edge vectors divided by the cross-product term from SKIPBACKFACE, then
// broadcast per component so they can scale whole attribute vectors
4914 attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4915 attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4916 attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4917 attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4918 attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
// w (component 3 of each screen point) broadcast to all lanes
4919 w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4920 w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4921 w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
// plane equation for w itself, relative to point 1: x slope, y slope and origin
4922 attribedge1 = _mm_sub_ss(w0, w1);
4923 attribedge2 = _mm_sub_ss(w2, w1);
4924 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4925 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4926 x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4927 y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4928 attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
// stored as w[0] = x slope, w[1] = y slope, w[2] = origin (read back by DPSOFTRAST_Draw_ProcessSpans)
4929 _mm_store_ss(&triangle->w[0], attribxslope);
4930 _mm_store_ss(&triangle->w[1], attribyslope);
4931 _mm_store_ss(&triangle->w[2], attriborigin);
4932 mipedgescale = _mm_setzero_ps();
// same plane setup for every attribute array the current shader mode lists
4933 for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4935 __m128 attrib0, attrib1, attrib2;
4936 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
// an out-of-range entry marks the end of the list (the statement taken here is not shown)
4937 if (k >= DPSOFTRAST_ARRAY_TOTAL)
// step to the next per-vertex array in the command's data block
4939 arrays += numvertices*4;
4940 GENATTRIBS(attrib0, attrib1, attrib2);
// attributes are pre-multiplied by w before the slopes are taken
4941 attriborigin = _mm_mul_ps(attrib1, w1);
4942 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4943 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4944 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4945 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4946 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4947 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4948 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4949 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
// the shader mode's LOD array drives mip selection: attribute deltas along the
// two edges, scaled by an approximate reciprocal of each edge's screen length
4950 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4952 mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4953 mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4954 mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4955 mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
// default every texture unit to mip 0, then pick a level for each bound texture
// whose filter mode is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4959 memset(triangle->mip, 0, sizeof(triangle->mip));
4960 for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4962 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4963 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4965 texture = thread->texbound[texunit];
4966 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
// scales by two ints loaded from texture->mipmap[0][2..3]
// (presumably the level-0 width and height - TODO confirm), then takes the
// smaller of the two squared edge densities
4968 mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4969 mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4970 mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4971 mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4972 // this will be multiplied in the texturing routine by the texture resolution
4973 y = _mm_cvtss_si32(mipdensity);
// y holds a squared density, so the level is 0.5*log2(y), clamped to the last mip
4976 y = (int)(log((float)y)*0.5f/M_LN2);
4977 if (y > texture->mipmaps - 1)
4978 y = texture->mipmaps - 1;
4979 triangle->mip[texunit] = y;
// walk the rows in up to two passes: first up to the end of band 1, then
// (after y is moved up to at least miny2) up to the end of band 2
4985 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4988 __m128 xcoords, xslope;
// per-point mask of "y is greater than this point's y"; the switch tables map
// the mask to the two polygon edges (previous point p, next point n) that
// bound the current row
4989 __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4990 int yccmask = _mm_movemask_epi8(ycc);
4991 int edge0p, edge0n, edge1p, edge1n;
// this first table references point 3, so it appears to be the four-point case;
// the second table only uses points 0..2 (the selecting condition is not shown)
4998 case 0xFFFF: /*0000*/ y = endy; continue;
4999 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5000 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5001 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5002 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5003 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5004 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5005 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5006 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5007 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5008 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5009 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5010 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5011 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5012 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5013 case 0x0000: /*1111*/ y++; continue;
5021 case 0xFFFF: /*000*/ y = endy; continue;
5022 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5023 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5024 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5025 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5026 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5027 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5028 case 0x0000: /*111*/ y++; continue;
// nexty: last row that can reuse the current edge pair, limited to the current band
5031 ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5032 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5033 ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5034 nexty = _mm_extract_epi16(ycc, 0);
5035 if (nexty >= bandy) nexty = bandy-1;
// dx/dy of both edges, and their x at row y (plus 0.5 before conversion to integer)
5036 xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5037 xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5038 xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5039 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5040 xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
// keep the smaller x in the low half so startx <= endx below
5041 if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5043 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5044 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
// one iteration per row; both edge x positions advance by their slope
5046 for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5048 int startx, endx, offset;
5049 startx = _mm_cvtss_si32(xcoords);
5050 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
// left-edge adjustment: startx is moved by (minx - startx) with the low
// DPSOFTRAST_DRAW_MAXSPANLENGTH-1 bits masked off
// (the condition enclosing these two lines is not shown in this listing)
5053 if (startx < 0) startx = 0;
5054 startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5056 if (endx > maxx) endx = maxx;
5057 if (startx >= endx) continue;
// cut the row into chunks of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
// span->startx/endx are relative to the chunk's offset
5058 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5060 DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5061 span->triangle = thread->numtriangles;
5064 span->startx = max(minx - offset, 0);
5065 span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5066 if (span->startx >= span->endx)
// span queue full: process it now (ProcessSpans resets numspans to 0)
5068 if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5069 DPSOFTRAST_Draw_ProcessSpans(thread);
// triangle queue full: flush pending spans before triangle slots are reused
5074 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5076 DPSOFTRAST_Draw_ProcessSpans(thread);
5077 thread->numtriangles = 0;
// drop this thread's reference; the last thread frees separately allocated vertex data
5081 if (!ATOMIC_DECREMENT(command->refcount))
5083 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5084 MM_FREE(command->arrays);
// flush whatever is still queued for this command
5087 if (thread->numspans > 0 || thread->numtriangles > 0)
5089 DPSOFTRAST_Draw_ProcessSpans(thread);
5090 thread->numtriangles = 0;
// Allocates a Draw command plus the storage for its post-transform vertex data
// and a private copy of the index list.
//
// firstvertex, numvertices: vertex range of the draw.
// numtriangles:             number of index triples to copy.
// element3i / element3s:    32-bit or 16-bit index list (the conditions that pick
//                           between them are not shown in this listing).
// The command is handed back to the caller (DPSOFTRAST_DrawTriangles uses the
// result); the return statement itself is not shown.
//
// Side effects: points dpsoftrast.screencoord4f and dpsoftrast.post_array4f[] into
// the new data block and records firstvertex/numvertices in dpsoftrast.
5095 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5099 int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
// two float[4] arrays are always needed: screen coordinates and transformed positions
5100 int datasize = 2*numvertices*sizeof(float[4]);
5101 DPSOFTRAST_Command_Draw *command;
5102 unsigned char *data;
// plus one float[4] array per attribute array listed by the current shader mode
5103 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5105 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5106 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5108 datasize += numvertices*sizeof(float[4]);
// plus room for the index copy, in whichever format was supplied
5111 datasize += numtriangles*sizeof(unsigned short[3]);
5113 datasize += numtriangles*sizeof(int[3]);
5114 datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
// too large to live in the command pool: allocate the data separately.
// Such commands are recognisable later by a commandsize no larger than the bare header.
// NOTE(review): the MM_CALLOC result is used without a visible NULL check.
5115 if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5117 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5118 data = (unsigned char *)MM_CALLOC(datasize, 1);
// otherwise the data is placed inline, directly after the command header
5122 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5123 data = (unsigned char *)command + commandsize;
5125 command->firstvertex = firstvertex;
5126 command->numvertices = numvertices;
5127 command->numtriangles = numtriangles;
5128 command->arrays = (float *)data;
// carve the data block: screen coords, then positions, then each shader array in
// table order; arrays the shader mode does not list stay NULL
5129 memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5130 dpsoftrast.firstvertex = firstvertex;
5131 dpsoftrast.numvertices = numvertices;
5132 dpsoftrast.screencoord4f = (float *)data;
5133 data += numvertices*sizeof(float[4]);
5134 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5135 data += numvertices*sizeof(float[4]);
5136 for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5138 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5139 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5141 dpsoftrast.post_array4f[j] = (float *)data;
5142 data += numvertices*sizeof(float[4]);
// the index copy goes at the end of the data block, so the command does not
// depend on the caller's index memory
5144 command->element3i = NULL;
5145 command->element3s = NULL;
5148 command->element3s = (unsigned short *)data;
5149 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5153 command->element3i = (int *)data;
5154 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
// Public draw entry point: allocates a Draw command, runs the current shader
// mode's Vertex callback, and queues the command for the raster threads.
//
// firstvertex, numvertices: vertex range to transform.
// numtriangles:             number of triangles in the index list.
// element3i / element3s:    32-bit or 16-bit index list.
//
// NOTE(review): this listing omits short source lines (braces, else, return).
5159 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5161 DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5162 DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
// y extent of the draw, clamped to the framebuffer height
5163 command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5164 command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
// empty y range: release separately allocated vertex data and take the command
// back out of the pool
5165 if (command->starty >= command->endy)
5167 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5168 MM_FREE(command->arrays);
5169 DPSOFTRAST_UndoCommand(command->commandsize);
5172 command->clipped = dpsoftrast.drawclipped;
// one reference per thread; each thread drops its reference in DPSOFTRAST_Interpret_Draw
5173 command->refcount = dpsoftrast.numthreads;
5175 if (dpsoftrast.usethreads)
// publish the command, then wake only starving threads whose bands overlap the draw
5178 DPSOFTRAST_Draw_SyncCommands();
5179 for (i = 0; i < dpsoftrast.numthreads; i++)
5181 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5182 if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5183 Thread_CondSignal(thread->drawcond);
// presumably the non-threaded branch (the else line is not shown): run the queue immediately
5188 DPSOFTRAST_Draw_FlushThreads();
// SetRenderTargets command (opcode 23) carrying the new framebuffer width and height.
5192 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler: the visible statement only flags the thread's
// framebuffer-derived state for revalidation; the command's fields are not read here.
5193 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5195 thread->validate |= DPSOFTRAST_VALIDATE_FB;
// Binds a new set of render targets (one depth buffer and up to four color buffers).
//
// width, height:  dimensions of the targets in pixels.
// depthpixels:    depth buffer, or NULL.
// colorpixels0-3: color buffers, or NULL for unused slots.
//
// Does nothing when every argument already matches the current state.
// NOTE(review): this listing omits short source lines; any statement between the
// comparison and the assignments is not visible here.
5197 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5199 DPSOFTRAST_Command_SetRenderTargets *command;
5200 if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5201 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5202 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5204 dpsoftrast.fb_width = width;
5205 dpsoftrast.fb_height = height;
5206 dpsoftrast.fb_depthpixels = depthpixels;
5207 dpsoftrast.fb_colorpixels[0] = colorpixels0;
5208 dpsoftrast.fb_colorpixels[1] = colorpixels1;
5209 dpsoftrast.fb_colorpixels[2] = colorpixels2;
5210 dpsoftrast.fb_colorpixels[3] = colorpixels3;
// the viewport transform depends on the framebuffer size, so recompute it
5211 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// queue a command so every raster thread revalidates its framebuffer state
5212 command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5213 command->width = width;
5214 command->height = height;
// Interprets the queued commands for one thread, from thread->commandoffset up to
// (not including) endoffset, and stores the final position back in
// thread->commandoffset.
//
// thread:    the thread whose state the commands are applied to.
// endoffset: offset in dpsoftrast.commandpool.commands at which to stop.
//
// NOTE(review): this listing omits short source lines (braces, break, parts of the
// Draw and Reset cases); comments describe only what is visible.
5217 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5219 int commandoffset = thread->commandoffset;
5220 while (commandoffset != endoffset)
5222 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5223 switch (command->opcode)
// INTERPCOMMAND(name) expands to a case that dispatches to
// DPSOFTRAST_Interpret_<name>, advances commandoffset by the aligned size of that
// command's struct, and wraps to 0 once the end of the pool is reached.
5225 #define INTERPCOMMAND(name) \
5226 case DPSOFTRAST_OPCODE_##name : \
5227 DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5228 commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5229 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5230 commandoffset = 0; \
5232 INTERPCOMMAND(Viewport)
5233 INTERPCOMMAND(ClearColor)
5234 INTERPCOMMAND(ClearDepth)
5235 INTERPCOMMAND(ColorMask)
5236 INTERPCOMMAND(DepthTest)
5237 INTERPCOMMAND(ScissorTest)
5238 INTERPCOMMAND(Scissor)
5239 INTERPCOMMAND(BlendFunc)
5240 INTERPCOMMAND(BlendSubtract)
5241 INTERPCOMMAND(DepthMask)
5242 INTERPCOMMAND(DepthFunc)
5243 INTERPCOMMAND(DepthRange)
5244 INTERPCOMMAND(PolygonOffset)
5245 INTERPCOMMAND(CullFace)
5246 INTERPCOMMAND(AlphaTest)
5247 INTERPCOMMAND(AlphaFunc)
5248 INTERPCOMMAND(SetTexture)
5249 INTERPCOMMAND(SetShader)
5250 INTERPCOMMAND(Uniform4f)
5251 INTERPCOMMAND(UniformMatrix4f)
5252 INTERPCOMMAND(Uniform1i)
5253 INTERPCOMMAND(SetRenderTargets)
// Draw commands are variable-sized (inline vertex data), so they advance by the
// size recorded in the command instead of a fixed struct size; the thread's
// published offset is updated right after each draw
5255 case DPSOFTRAST_OPCODE_Draw:
5256 DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5257 commandoffset += command->commandsize;
5258 if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5260 thread->commandoffset = commandoffset;
// (the statements of the Reset case are not shown in this listing)
5263 case DPSOFTRAST_OPCODE_Reset:
5268 thread->commandoffset = commandoffset;
// Worker thread entry point (passed to Thread_CreateThread by DPSOFTRAST_Init).
//
// data: the DPSOFTRAST_State_Thread owned by this worker.
//
// Loops while thread->index is non-negative: interprets any commands queued up to
// dpsoftrast.drawcommand, and when there is nothing left to do, sleeps on
// thread->drawcond until signalled.
// NOTE(review): this listing omits short source lines (braces, the return).
5271 static int DPSOFTRAST_Draw_Thread(void *data)
5273 DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5274 while(thread->index >= 0)
5276 if (thread->commandoffset != dpsoftrast.drawcommand)
5278 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// re-check under the mutex so a command published in the meantime is not missed
5282 Thread_LockMutex(thread->drawmutex);
5283 if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
// caught up: release a flusher blocked on waitcond, then advertise that this
// thread is idle so producers know to signal drawcond
5285 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5286 thread->starving = true;
5287 Thread_CondWait(thread->drawcond, thread->drawmutex);
5288 thread->starving = false;
5290 Thread_UnlockMutex(thread->drawmutex);
// Makes sure every thread has interpreted all commands queued so far, then marks
// the command pool as empty.
//
// With worker threads: wakes idle workers that still have pending commands, then
// blocks on each worker's waitcond until it has caught up.
// Without worker threads: interprets the pending commands inline for every thread
// state (the else line separating the two paths is not shown in this listing).
5296 static void DPSOFTRAST_Draw_FlushThreads(void)
5298 DPSOFTRAST_State_Thread *thread;
5300 DPSOFTRAST_Draw_SyncCommands();
5301 if (dpsoftrast.usethreads)
// pass 1: wake every starving worker that is behind
5303 for (i = 0; i < dpsoftrast.numthreads; i++)
5305 thread = &dpsoftrast.threads[i];
5306 if (thread->commandoffset != dpsoftrast.drawcommand)
5308 Thread_LockMutex(thread->drawmutex);
5309 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5310 Thread_CondSignal(thread->drawcond);
5311 Thread_UnlockMutex(thread->drawmutex);
// pass 2: wait for each worker that is still behind; the worker signals waitcond
// once it has caught up and sees thread->waiting set
// NOTE(review): the wait is guarded by `if`, not a loop - if Thread_CondWait can
// wake spuriously the flush could continue too early; confirm its semantics
5314 for (i = 0; i < dpsoftrast.numthreads; i++)
5316 thread = &dpsoftrast.threads[i];
5317 if (thread->commandoffset != dpsoftrast.drawcommand)
5319 Thread_LockMutex(thread->drawmutex);
5320 if (thread->commandoffset != dpsoftrast.drawcommand)
5322 thread->waiting = true;
5323 Thread_CondWait(thread->waitcond, thread->drawmutex);
5324 thread->waiting = false;
5326 Thread_UnlockMutex(thread->drawmutex);
// single-threaded path: run the pending commands on the calling thread
5332 for (i = 0; i < dpsoftrast.numthreads; i++)
5334 thread = &dpsoftrast.threads[i];
5335 if (thread->commandoffset != dpsoftrast.drawcommand)
5336 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
// everything queued has been consumed
5339 dpsoftrast.commandpool.usedcommands = 0;
// Public flush: blocks until all queued commands have been executed, by
// delegating to DPSOFTRAST_Draw_FlushThreads.
5342 void DPSOFTRAST_Flush(void)
5344 DPSOFTRAST_Draw_FlushThreads();
// Public finish entry point.
// NOTE(review): the body is not shown in this listing - presumably it flushes
// pending work like DPSOFTRAST_Flush; confirm against the full source.
5347 void DPSOFTRAST_Finish(void)
// Initializes the global rasterizer state and creates the per-thread states
// (and, when threading is available, the worker threads themselves).
//
// width, height: framebuffer dimensions in pixels.
// numthreads:    requested worker count; threading is used only when this is > 0
//                and Thread_HasThreads() reports support, and the count is then
//                clamped to 1..64. Otherwise a single thread state is used.
// interlace:     clamped to 0..1 when threading is used, forced to 0 otherwise.
// colorpixels:   first color target (the other three color slots start unbound).
// depthpixels:   depth buffer.
//
// NOTE(review): this listing omits short source lines (braces, local declarations
// including `u`, the return statement); only the visible statements are documented.
// NOTE(review): the MM_CALLOC result for dpsoftrast.threads is used without a
// visible NULL check.
5352 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
// start from an all-zero global state
5362 memset(&dpsoftrast, 0, sizeof(dpsoftrast));
// endianness probe result (u is set up on lines not shown in this listing)
5363 dpsoftrast.bigendian = u.b[3];
5364 dpsoftrast.fb_width = width;
5365 dpsoftrast.fb_height = height;
5366 dpsoftrast.fb_depthpixels = depthpixels;
// only the first color target is supplied by the caller; the remaining three
// slots start out unbound
5367 dpsoftrast.fb_colorpixels[0] = colorpixels;
5368 dpsoftrast.fb_colorpixels[1] = NULL;
5369 dpsoftrast.fb_colorpixels[2] = NULL;
5370 dpsoftrast.fb_colorpixels[3] = NULL;
// default viewport covers the whole framebuffer
5371 dpsoftrast.viewport[0] = 0;
5372 dpsoftrast.viewport[1] = 0;
5373 dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5374 dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5375 DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
// texture table starts empty: no storage yet (texture_max 0), first usable slot is 1
5376 dpsoftrast.texture_firstfree = 1;
5377 dpsoftrast.texture_end = 1;
5378 dpsoftrast.texture_max = 0;
5379 dpsoftrast.color[0] = 1;
5380 dpsoftrast.color[1] = 1;
5381 dpsoftrast.color[2] = 1;
5382 dpsoftrast.color[3] = 1;
5383 dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5384 dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5385 dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5386 dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
// give every thread state the same default render state
5387 for (i = 0; i < dpsoftrast.numthreads; i++)
5389 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5391 thread->cullface = GL_BACK;
5392 thread->colormask[1] = 1;
5393 thread->colormask[2] = 1;
5394 thread->colormask[3] = 1;
5395 thread->blendfunc[0] = GL_ONE;
5396 thread->blendfunc[1] = GL_ZERO;
5397 thread->depthmask = true;
5398 thread->depthtest = true;
5399 thread->depthfunc = GL_LEQUAL;
5400 thread->scissortest = false;
5401 thread->alphatest = false;
5402 thread->alphafunc = GL_GREATER;
5403 thread->alphavalue = 0.5f;
5404 thread->viewport[0] = 0;
5405 thread->viewport[1] = 0;
5406 thread->viewport[2] = dpsoftrast.fb_width;
5407 thread->viewport[3] = dpsoftrast.fb_height;
5408 thread->scissor[0] = 0;
5409 thread->scissor[1] = 0;
5410 thread->scissor[2] = dpsoftrast.fb_width;
5411 thread->scissor[3] = dpsoftrast.fb_height;
5412 thread->depthrange[0] = 0;
5413 thread->depthrange[1] = 1;
5414 thread->polygonoffset[0] = 0;
5415 thread->polygonoffset[1] = 0;
5417 DPSOFTRAST_RecalcThread(thread);
// empty work queues, nothing interpreted yet
5419 thread->numspans = 0;
5420 thread->numtriangles = 0;
5421 thread->commandoffset = 0;
5422 thread->waiting = false;
5423 thread->starving = false;
// force a full validation pass so all derived state is computed once up front
5425 thread->validate = -1;
5426 DPSOFTRAST_Validate(thread, -1);
// synchronisation objects must exist before the worker starts, since
// DPSOFTRAST_Draw_Thread locks drawmutex and waits on drawcond
5428 if (dpsoftrast.usethreads)
5430 thread->waitcond = Thread_CreateCond();
5431 thread->drawcond = Thread_CreateCond();
5432 thread->drawmutex = Thread_CreateMutex();
5433 thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5439 void DPSOFTRAST_Shutdown(void)
5442 if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5444 DPSOFTRAST_State_Thread *thread;
5445 for (i = 0; i < dpsoftrast.numthreads; i++)
5447 thread = &dpsoftrast.threads[i];
5448 Thread_LockMutex(thread->drawmutex);
5450 Thread_CondSignal(thread->drawcond);
5451 Thread_UnlockMutex(thread->drawmutex);
5452 Thread_WaitThread(thread->thread, 0);
5453 Thread_DestroyCond(thread->waitcond);
5454 Thread_DestroyCond(thread->drawcond);
5455 Thread_DestroyMutex(thread->drawmutex);
5458 for (i = 0;i < dpsoftrast.texture_end;i++)
5459 if (dpsoftrast.texture[i].bytes)
5460 MM_FREE(dpsoftrast.texture[i].bytes);
5461 if (dpsoftrast.texture)
5462 free(dpsoftrast.texture);
5463 if (dpsoftrast.threads)
5464 MM_FREE(dpsoftrast.threads);
5465 memset(&dpsoftrast, 0, sizeof(dpsoftrast));